Total coverage: 152261 (9%)of 1848375
3 5 5 5 5 8 8 2 8 12 6 1 3 5 5 5 3 2 2 2 3 6 5 5 1 1 3 3 3 2 1 1 9 6 5 1 5 4 14 3 3 9 9 10 3 2 2 1 6 2 5 5 2 1 2 2 7 1 3 4 3 1 5 5 10 10 10 10 16 1 5 1 11 4 194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/bpf-netns.h> #include <linux/filter.h> #include <net/net_namespace.h> /* * Functions to manage BPF programs attached to netns */ struct bpf_netns_link { struct bpf_link link; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. Instead we rely on pernet * pre_exit callback to clear this pointer. Must be accessed * with netns_bpf_mutex held. */ struct net *net; struct list_head node; /* node in list of links attached to net */ enum netns_bpf_attach_type netns_type; }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_dec(&bpf_sk_lookup_enabled); break; #endif default: break; } } static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_inc(&bpf_sk_lookup_enabled); break; #endif default: break; } } /* Must be called with netns_bpf_mutex held. */ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) { struct bpf_prog_array *run_array; run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); } static int link_index(struct net *net, enum netns_bpf_attach_type type, struct bpf_netns_link *link) { struct bpf_netns_link *pos; int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { if (pos == link) return i; i++; } return -ENOENT; } static int link_count(struct net *net, enum netns_bpf_attach_type type) { struct list_head *pos; int i = 0; list_for_each(pos, &net->bpf.links[type]) i++; return i; } static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog_array *prog_array) { struct bpf_netns_link *pos; unsigned int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { prog_array->items[i].prog = pos->link.prog; i++; } } static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *old_array, *new_array; struct net *net; int cnt, idx; mutex_lock(&netns_bpf_mutex); /* We can race with cleanup_net, but if we see a non-NULL * struct net pointer, pre_exit has not run yet and wait for * netns_bpf_mutex. */ net = net_link->net; if (!net) goto out_unlock; /* Mark attach point as unused */ netns_bpf_attach_type_unneed(type); /* Remember link position in case of safe delete */ idx = link_index(net, type, net_link); list_del(&net_link->node); cnt = link_count(net, type); if (!cnt) { netns_bpf_run_array_detach(net, type); goto out_unlock; } old_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); if (!new_array) { WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); goto out_unlock; } fill_prog_array(net, type, new_array); rcu_assign_pointer(net->bpf.run_array[type], new_array); bpf_prog_array_free(old_array); out_unlock: net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } static int bpf_netns_link_detach(struct bpf_link *link) { bpf_netns_link_release(link); return 0; } static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); kfree(net_link); } static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; if (new_prog->type != link->prog->type) return -EINVAL; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (!net || !check_net(net)) { /* Link auto-detached or netns dying */ ret = -ENOLINK; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); idx = link_index(net, type, net_link); ret = bpf_prog_array_update_at(run_array, idx, new_prog); if (ret) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } static int bpf_netns_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); unsigned int inum = 0; struct net *net; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (net && check_net(net)) inum = net->ns.inum; mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; info->netns.attach_type = link->attach_type; return 0; } static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_link_info info = {}; bpf_netns_link_fill_info(link, &info); seq_printf(seq, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, link->attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, }; /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct net *net, enum netns_bpf_attach_type type) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); struct bpf_prog_array *run_array; u32 prog_cnt = 0, flags = 0; run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) prog_cnt = bpf_prog_array_length(run_array); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) return -EFAULT; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) return 0; return bpf_prog_array_copy_to_user(run_array, prog_ids, attr->query.prog_cnt); } int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { enum netns_bpf_attach_type type; struct net *net; int ret; if (attr->query.query_flags) return -EINVAL; type = to_netns_bpf_attach_type(attr->query.attach_type); if (type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_query(attr, uattr, net, type); mutex_unlock(&netns_bpf_mutex); put_net(net); return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; int ret; if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } if (ret) goto out_unlock; attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) { WRITE_ONCE(run_array->items[0].prog, prog); } else { run_array = bpf_prog_array_alloc(1, GFP_KERNEL); if (!run_array) { ret = -ENOMEM; goto out_unlock; } run_array->items[0].prog = prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); } net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_detach(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog *old) { struct bpf_prog *attached; /* Progs attached via links cannot be detached */ if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; struct bpf_prog *prog; int ret; if (attr->target_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); bpf_prog_put(prog); return ret; } static int netns_bpf_max_progs(enum netns_bpf_attach_type type) { switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; case NETNS_BPF_SK_LOOKUP: return 64; default: return 0; } } static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int cnt, err; mutex_lock(&netns_bpf_mutex); cnt = link_count(net, type); if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } /* Links are not compatible with attaching prog directly */ if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; case NETNS_BPF_SK_LOOKUP: err = 0; /* nothing to check */ break; default: err = -EINVAL; break; } if (err) goto out_unlock; run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } list_add_tail(&net_link->node, &net->bpf.links[type]); fill_prog_array(net, type, run_array); run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); /* Mark attach point as used */ netns_bpf_attach_type_need(type); out_unlock: mutex_unlock(&netns_bpf_mutex); return err; } int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type netns_type; struct bpf_link_primer link_primer; struct bpf_netns_link *net_link; enum bpf_attach_type type; struct net *net; int err; if (attr->link_create.flags) return -EINVAL; type = attr->link_create.attach_type; netns_type = to_netns_bpf_attach_type(type); if (netns_type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->link_create.target_fd); if (IS_ERR(net)) return PTR_ERR(net); net_link = kzalloc(sizeof(*net_link), GFP_USER); if (!net_link) { err = -ENOMEM; goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, &bpf_netns_link_ops, prog, type); net_link->net = net; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); if (err) { kfree(net_link); goto out_put_net; } err = netns_bpf_link_attach(net, &net_link->link, netns_type); if (err) { bpf_link_cleanup(&link_primer); goto out_put_net; } put_net(net); return bpf_link_settle(&link_primer); out_put_net: put_net(net); return err; } static int __net_init netns_bpf_pernet_init(struct net *net) { int type; for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) INIT_LIST_HEAD(&net->bpf.links[type]); return 0; } static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ netns_bpf_attach_type_unneed(type); } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; static int __init netns_bpf_init(void) { return register_pernet_subsys(&netns_bpf_pernet_ops); } subsys_initcall(netns_bpf_init);
5 3 5 5 1 1 3 4 5 5 3 2 1 1 3 3 1 2 3 3 3 1 1 4 4 4 4 8 8 3 5 5 1 53 53 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * Lauro Ramos Venancio <lauro.venancio@openbossa.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/tcp_states.h> #include <linux/nfc.h> #include <linux/export.h> #include <linux/kcov.h> #include "nfc.h" static struct nfc_sock_list raw_sk_list = { .lock = __RW_LOCK_UNLOCKED(raw_sk_list.lock) }; static void nfc_sock_link(struct nfc_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); write_unlock(&l->lock); } static void nfc_sock_unlink(struct nfc_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } static void rawsock_write_queue_purge(struct sock *sk) { pr_debug("sk=%p\n", sk); spin_lock_bh(&sk->sk_write_queue.lock); __skb_queue_purge(&sk->sk_write_queue); nfc_rawsock(sk)->tx_work_scheduled = false; spin_unlock_bh(&sk->sk_write_queue.lock); } static void rawsock_report_error(struct sock *sk, int err) { pr_debug("sk=%p err=%d\n", sk, err); sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_err = -err; sk_error_report(sk); rawsock_write_queue_purge(sk); } static int rawsock_release(struct socket *sock) { struct sock *sk = sock->sk; pr_debug("sock=%p sk=%p\n", sock, sk); if (!sk) return 0; if (sock->type == SOCK_RAW) nfc_sock_unlink(&raw_sk_list, sk); sock_orphan(sk); sock_put(sk); return 0; } static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, int len, int flags) { struct sock *sk = sock->sk; struct sockaddr_nfc *addr = (struct sockaddr_nfc *)_addr; struct nfc_dev *dev; int rc = 0; pr_debug("sock=%p sk=%p flags=%d\n", sock, sk, flags); if (!addr || len < sizeof(struct sockaddr_nfc) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); lock_sock(sk); if (sock->state == SS_CONNECTED) { rc = -EISCONN; goto error; } dev = nfc_get_device(addr->dev_idx); if (!dev) { rc = -ENODEV; goto error; } if (addr->target_idx > dev->target_next_idx - 1 || addr->target_idx < dev->target_next_idx - dev->n_targets) { rc = -EINVAL; goto put_dev; } rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol); if (rc) goto put_dev; nfc_rawsock(sk)->dev = dev; nfc_rawsock(sk)->target_idx = addr->target_idx; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; sk->sk_state_change(sk); release_sock(sk); return 0; put_dev: nfc_put_device(dev); error: release_sock(sk); return rc; } static int rawsock_add_header(struct sk_buff *skb) { *(u8 *)skb_push(skb, NFC_HEADER_SIZE) = 0; return 0; } static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb, int err) { struct sock *sk = (struct sock *) context; BUG_ON(in_hardirq()); pr_debug("sk=%p err=%d\n", sk, err); if (err) goto error; err = rawsock_add_header(skb); if (err) goto error_skb; err = sock_queue_rcv_skb(sk, skb); if (err) goto error_skb; spin_lock_bh(&sk->sk_write_queue.lock); if (!skb_queue_empty(&sk->sk_write_queue)) schedule_work(&nfc_rawsock(sk)->tx_work); else nfc_rawsock(sk)->tx_work_scheduled = false; spin_unlock_bh(&sk->sk_write_queue.lock); sock_put(sk); return; error_skb: kfree_skb(skb); error: rawsock_report_error(sk, err); sock_put(sk); } static void rawsock_tx_work(struct work_struct *work) { struct sock *sk = to_rawsock_sk(work); struct nfc_dev *dev = nfc_rawsock(sk)->dev; u32 target_idx = nfc_rawsock(sk)->target_idx; struct sk_buff *skb; int rc; pr_debug("sk=%p target_idx=%u\n", sk, target_idx); if (sk->sk_shutdown & SEND_SHUTDOWN) { rawsock_write_queue_purge(sk); return; } skb = skb_dequeue(&sk->sk_write_queue); kcov_remote_start_common(skb_get_kcov_handle(skb)); sock_hold(sk); rc = nfc_data_exchange(dev, target_idx, skb, rawsock_data_exchange_complete, sk); if (rc) { rawsock_report_error(sk, rc); sock_put(sk); } kcov_remote_stop(); } static int rawsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_dev *dev = nfc_rawsock(sk)->dev; struct sk_buff *skb; int rc; pr_debug("sock=%p sk=%p len=%zu\n", sock, sk, len); if (msg->msg_namelen) return -EOPNOTSUPP; if (sock->state != SS_CONNECTED) return -ENOTCONN; skb = nfc_alloc_send_skb(dev, sk, msg->msg_flags, len, &rc); if (skb == NULL) return rc; rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); return rc; } spin_lock_bh(&sk->sk_write_queue.lock); __skb_queue_tail(&sk->sk_write_queue, skb); if (!nfc_rawsock(sk)->tx_work_scheduled) { schedule_work(&nfc_rawsock(sk)->tx_work); nfc_rawsock(sk)->tx_work_scheduled = true; } spin_unlock_bh(&sk->sk_write_queue.lock); return len; } static int rawsock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied; int rc; pr_debug("sock=%p sk=%p len=%zu flags=%d\n", sock, sk, len, flags); skb = skb_recv_datagram(sk, flags, &rc); if (!skb) return rc; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } rc = skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); return rc ? : copied; } static const struct proto_ops rawsock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .release = rawsock_release, .bind = sock_no_bind, .connect = rawsock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = rawsock_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops rawsock_raw_ops = { .family = PF_NFC, .owner = THIS_MODULE, .release = rawsock_release, .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = rawsock_recvmsg, .mmap = sock_no_mmap, }; static void rawsock_destruct(struct sock *sk) { pr_debug("sk=%p\n", sk); if (sk->sk_state == TCP_ESTABLISHED) { nfc_deactivate_target(nfc_rawsock(sk)->dev, nfc_rawsock(sk)->target_idx, NFC_TARGET_MODE_IDLE); nfc_put_device(nfc_rawsock(sk)->dev); } skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Freeing alive NFC raw socket %p\n", sk); return; } } static int rawsock_create(struct net *net, struct socket *sock, const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; pr_debug("sock=%p\n", sock); if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW)) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW) { if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; sock->ops = &rawsock_raw_ops; } else { sock->ops = &rawsock_ops; } sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sk->sk_protocol = nfc_proto->id; sk->sk_destruct = rawsock_destruct; sock->state = SS_UNCONNECTED; if (sock->type == SOCK_RAW) nfc_sock_link(&raw_sk_list, sk); else { INIT_WORK(&nfc_rawsock(sk)->tx_work, rawsock_tx_work); nfc_rawsock(sk)->tx_work_scheduled = false; } return 0; } void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb, u8 payload_type, u8 direction) { struct sk_buff *skb_copy = NULL, *nskb; struct sock *sk; u8 *data; read_lock(&raw_sk_list.lock); sk_for_each(sk, &raw_sk_list.head) { if (!skb_copy) { skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, GFP_ATOMIC, true); if (!skb_copy) continue; data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); data[0] = dev ? dev->idx : 0xFF; data[1] = direction & 0x01; data[1] |= (payload_type << 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&raw_sk_list.lock); kfree_skb(skb_copy); } EXPORT_SYMBOL(nfc_send_to_raw_sock); static struct proto rawsock_proto = { .name = "NFC_RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct nfc_rawsock), }; static const struct nfc_protocol rawsock_nfc_proto = { .id = NFC_SOCKPROTO_RAW, .proto = &rawsock_proto, .owner = THIS_MODULE, .create = rawsock_create }; int __init rawsock_init(void) { int rc; rc = nfc_proto_register(&rawsock_nfc_proto); return rc; } void rawsock_exit(void) { nfc_proto_unregister(&rawsock_nfc_proto); }
5 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_lookup { struct nft_set *set; u8 sreg; u8 dreg; bool dreg_set; bool invert; struct nft_set_binding binding; }; static const struct nft_set_ext * __nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { #ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) return nft_hash_lookup(net, set, key); if (set->ops == &nft_set_rhash_type.ops) return nft_rhash_lookup(net, set, key); if (set->ops == &nft_set_bitmap_type.ops) return nft_bitmap_lookup(net, set, key); if (set->ops == &nft_set_pipapo_type.ops) return nft_pipapo_lookup(net, set, key); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (set->ops == &nft_set_pipapo_avx2_type.ops) return nft_pipapo_avx2_lookup(net, set, key); #endif if (set->ops == &nft_set_rbtree_type.ops) return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); #endif return set->ops->lookup(net, set, key); } static unsigned int nft_base_seq(const struct net *net) { /* pairs with smp_store_release() in nf_tables_commit() */ return smp_load_acquire(&net->nft.base_seq); } static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) { return unlikely(seq != nft_base_seq(net)); } const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { const struct nft_set_ext *ext; unsigned int base_seq; do { base_seq = nft_base_seq(net); ext = __nft_set_do_lookup(net, set, key); if (ext) break; /* No match? There is a small chance that lookup was * performed in the old generation, but nf_tables_commit() * already unlinked a (matching) element. * * We need to repeat the lookup to make sure that we didn't * miss a matching element in the new generation. */ } while (nft_lookup_should_retry(net, base_seq)); return ext; } EXPORT_SYMBOL_GPL(nft_set_do_lookup); void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct net *net = nft_net(pkt); const struct nft_set_ext *ext; bool found; ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]); found = !!ext ^ priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; return; } } if (ext) { if (priv->dreg_set) nft_data_copy(&regs->data[priv->dreg], nft_set_ext_data(ext), set->dlen); nft_set_elem_update_expr(ext, regs, pkt); } } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_lookup *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u32 flags; int err; if (tb[NFTA_LOOKUP_SET] == NULL || tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], tb[NFTA_LOOKUP_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg, set->klen); if (err < 0) return err; if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); if (flags & NFT_LOOKUP_F_INV) priv->invert = true; } if (tb[NFTA_LOOKUP_DREG] != NULL) { if (priv->invert) return -EINVAL; if (!(set->flags & NFT_SET_MAP)) return -EINVAL; err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG], &priv->dreg, NULL, nft_set_datatype(set), set->dlen); if (err < 0) return err; priv->dreg_set = true; } else if (set->flags & NFT_SET_MAP) { /* Map given, but user asks for lookup only (i.e. to * ignore value assoicated with key). * * This makes no sense for anonymous maps since they are * scoped to the rule, but for named sets this can be useful. */ if (set->flags & NFT_SET_ANONYMOUS) return -EINVAL; } priv->binding.flags = set->flags & NFT_SET_MAP; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) return err; priv->set = set; return 0; } static void nft_lookup_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_lookup_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_lookup_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); nf_tables_destroy_set(ctx, priv->set); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_lookup *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0; if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg)) goto nla_put_failure; if (priv->dreg_set) if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); struct nft_set_iter iter; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; iter.genmask = nft_genmask_next(ctx->net); iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nft_setelem_validate; priv->set->ops->walk(ctx, priv->set, &iter); if (!iter.err) iter.err = nft_set_catchall_validate(ctx, priv->set); if (iter.err < 0) return iter.err; return 0; } static bool nft_lookup_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); if (priv->set->flags & NFT_SET_MAP) nft_reg_track_cancel(track, priv->dreg, priv->set->dlen); return false; } static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), .eval = nft_lookup_eval, .init = nft_lookup_init, .activate = nft_lookup_activate, .deactivate = nft_lookup_deactivate, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, .reduce = nft_lookup_reduce, }; struct nft_expr_type nft_lookup_type __read_mostly = { .name = "lookup", .ops = &nft_lookup_ops, .policy = nft_lookup_policy, .maxattr = NFTA_LOOKUP_MAX, .owner = THIS_MODULE, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 /* SPDX-License-Identifier: GPL-2.0 */ /* * Header for use in defining a given L4 protocol for connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalized L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_protcol.h */ #ifndef _NF_CONNTRACK_L4PROTO_H #define _NF_CONNTRACK_L4PROTO_H #include <linux/netlink.h> #include <net/netlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netns/generic.h> struct seq_file; struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; /* Resolve clashes on insertion races. */ bool allow_clash; /* protoinfo nlattr size, closes a hole */ u16 nlattr_size; /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); const struct nla_policy *nla_policy; struct { int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); u16 obj_size; u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); #endif }; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr); int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO IPPROTO_UDPLITE const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...); __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...); #else static inline __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...) {} static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) static inline struct nf_generic_net *nf_generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; } static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) { return &net->ct.nf_ct_proto.tcp; } static inline struct nf_udp_net *nf_udp_pernet(struct net *net) { return &net->ct.nf_ct_proto.udp; } static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmp; } static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmpv6; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) { return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && test_bit(IPS_ASSURED_BIT, &ct->status); } #endif #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { return &net->ct.nf_ct_proto.sctp; } #endif #ifdef CONFIG_NF_CT_PROTO_GRE static inline struct nf_gre_net *nf_gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } #endif #endif /*_NF_CONNTRACK_PROTOCOL_H*/
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> #include <linux/ptp_clock_kernel.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "../core/dev.h" #include "ts.h" struct tsconfig_req_info { struct ethnl_req_info base; }; struct tsconfig_reply_data { struct ethnl_reply_data base; struct hwtstamp_provider_desc hwprov_desc; struct { u32 tx_type; u32 rx_filter; u32 flags; } hwtst_config; }; #define TSCONFIG_REPDATA(__reply_base) \ container_of(__reply_base, struct tsconfig_reply_data, base) const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = { [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); struct hwtstamp_provider *hwprov = NULL; struct net_device *dev = reply_base->dev; struct kernel_hwtstamp_config cfg = {}; int ret; if (!dev->netdev_ops->ndo_hwtstamp_get) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = dev_get_hwtstamp_phylib(dev, &cfg); if (ret) goto out; data->hwtst_config.tx_type = BIT(cfg.tx_type); data->hwtst_config.rx_filter = BIT(cfg.rx_filter); data->hwtst_config.flags = cfg.flags; data->hwprov_desc.index = -1; hwprov = rtnl_dereference(dev->hwprov); if (hwprov) { data->hwprov_desc.index = hwprov->desc.index; data->hwprov_desc.qualifier = hwprov->desc.qualifier; } else { struct kernel_ethtool_ts_info ts_info = {}; ts_info.phc_index = -1; ret = __ethtool_get_ts_info(dev, &ts_info); if (ret) goto out; if (ts_info.phc_index == -1) return -ENODEV; data->hwprov_desc.index = ts_info.phc_index; data->hwprov_desc.qualifier = ts_info.phc_qualifier; } out: ethnl_ops_complete(dev); return ret; } static int tsconfig_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len = 0; int ret; BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); if (data->hwtst_config.flags) { ret = ethnl_bitset32_size(&data->hwtst_config.flags, NULL, __HWTSTAMP_FLAG_CNT, ts_flags_names, compact); if (ret < 0) return ret; len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */ } if (data->hwtst_config.tx_type) { ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; len += ret; /* _TSCONFIG_TX_TYPES */ } if (data->hwtst_config.rx_filter) { ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; len += ret; /* _TSCONFIG_RX_FILTERS */ } if (data->hwprov_desc.index >= 0) /* _TSCONFIG_HWTSTAMP_PROVIDER */ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); return len; } static int tsconfig_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; if (data->hwtst_config.flags) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, &data->hwtst_config.flags, NULL, __HWTSTAMP_FLAG_CNT, ts_flags_names, compact); if (ret < 0) return ret; } if (data->hwtst_config.tx_type) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES, &data->hwtst_config.tx_type, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; } if (data->hwtst_config.rx_filter) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS, &data->hwtst_config.rx_filter, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; } if (data->hwprov_desc.index >= 0) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, data->hwprov_desc.index) || nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, data->hwprov_desc.qualifier)) { nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); } return 0; } /* TSCONFIG_SET */ const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = { [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, }; static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) { struct tsconfig_reply_data *reply_data; struct tsconfig_req_info *req_info; struct sk_buff *rskb; void *reply_payload; int reply_len = 0; int ret; req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL); if (!reply_data) { kfree(req_info); return -ENOMEM; } ASSERT_RTNL(); reply_data->base.dev = dev; ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info); if (ret < 0) goto err_cleanup; ret = tsconfig_reply_size(&req_info->base, &reply_data->base); if (ret < 0) goto err_cleanup; reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); if (!rskb) goto err_cleanup; ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); if (ret < 0) goto err_cleanup; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); err_cleanup: kfree(reply_data); kfree(req_info); return ret; } static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base, struct genl_info *info) { const struct net_device_ops *ops = req_base->dev->netdev_ops; if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get) return -EOPNOTSUPP; return 1; } static struct hwtstamp_provider * tsconfig_set_hwprov_from_desc(struct net_device *dev, struct genl_info *info, struct hwtstamp_provider_desc *hwprov_desc) { struct kernel_ethtool_ts_info ts_info; struct hwtstamp_provider *hwprov; struct nlattr **tb = info->attrs; struct phy_device *phy = NULL; enum hwtstamp_source source; int ret; ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); if (!ret) { /* Found */ source = HWTSTAMP_SOURCE_NETDEV; } else { phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); if (IS_ERR(phy)) { if (PTR_ERR(phy) == -ENODEV) NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], "phc not in this net device topology"); return ERR_CAST(phy); } source = HWTSTAMP_SOURCE_PHYLIB; } hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL); if (!hwprov) return ERR_PTR(-ENOMEM); hwprov->desc.index = hwprov_desc->index; hwprov->desc.qualifier = hwprov_desc->qualifier; hwprov->source = source; hwprov->phydev = phy; return hwprov; } static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, struct genl_info *info) { struct kernel_hwtstamp_config hwtst_config = {0}; bool hwprov_mod = false, config_mod = false; struct hwtstamp_provider *hwprov = NULL; struct net_device *dev = req_base->dev; struct nlattr **tb = info->attrs; int ret; BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); if (!netif_device_present(dev)) return -ENODEV; if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) { struct hwtstamp_provider_desc __hwprov_desc = {.index = -1}; struct hwtstamp_provider *__hwprov; __hwprov = rtnl_dereference(dev->hwprov); if (__hwprov) { __hwprov_desc.index = __hwprov->desc.index; __hwprov_desc.qualifier = __hwprov->desc.qualifier; } ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], &__hwprov_desc, info->extack, &hwprov_mod); if (ret < 0) return ret; if (hwprov_mod) { hwprov = tsconfig_set_hwprov_from_desc(dev, info, &__hwprov_desc); if (IS_ERR(hwprov)) return PTR_ERR(hwprov); } } /* Get current hwtstamp config if we are not changing the * hwtstamp source. It will be zeroed in the other case. */ if (!hwprov_mod) { ret = dev_get_hwtstamp_phylib(dev, &hwtst_config); if (ret < 0 && ret != -EOPNOTSUPP) goto err_free_hwprov; } /* Get the hwtstamp config from netlink */ if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) { u32 req_tx_type; req_tx_type = BIT(hwtst_config.tx_type); ret = ethnl_update_bitset32(&req_tx_type, __HWTSTAMP_TX_CNT, tb[ETHTOOL_A_TSCONFIG_TX_TYPES], ts_tx_type_names, info->extack, &config_mod); if (ret < 0) goto err_free_hwprov; /* Select only one tx type at a time */ if (ffs(req_tx_type) != fls(req_tx_type)) { ret = -EINVAL; goto err_free_hwprov; } hwtst_config.tx_type = ffs(req_tx_type) - 1; } if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) { u32 req_rx_filter; req_rx_filter = BIT(hwtst_config.rx_filter); ret = ethnl_update_bitset32(&req_rx_filter, __HWTSTAMP_FILTER_CNT, tb[ETHTOOL_A_TSCONFIG_RX_FILTERS], ts_rx_filter_names, info->extack, &config_mod); if (ret < 0) goto err_free_hwprov; /* Select only one rx filter at a time */ if (ffs(req_rx_filter) != fls(req_rx_filter)) { ret = -EINVAL; goto err_free_hwprov; } hwtst_config.rx_filter = ffs(req_rx_filter) - 1; } if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { ret = ethnl_update_bitset32(&hwtst_config.flags, __HWTSTAMP_FLAG_CNT, tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], ts_flags_names, info->extack, &config_mod); if (ret < 0) goto err_free_hwprov; } ret = net_hwtstamp_validate(&hwtst_config); if (ret) goto err_free_hwprov; if (hwprov_mod) { struct kernel_hwtstamp_config zero_config = {0}; struct hwtstamp_provider *__hwprov; /* Disable current time stamping if we try to enable * another one */ ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack); if (ret < 0) goto err_free_hwprov; /* Change the selected hwtstamp source */ __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov); if (__hwprov) kfree_rcu(__hwprov, rcu_head); } if (config_mod) { ret = dev_set_hwtstamp_phylib(dev, &hwtst_config, info->extack); if (ret < 0) return ret; } ret = tsconfig_send_reply(dev, info); if (ret && ret != -EOPNOTSUPP) { NL_SET_ERR_MSG(info->extack, "error while reading the new configuration set"); return ret; } /* tsconfig has no notification */ return 0; err_free_hwprov: kfree(hwprov); return ret; } const struct ethnl_request_ops ethnl_tsconfig_request_ops = { .request_cmd = ETHTOOL_MSG_TSCONFIG_GET, .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY, .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER, .req_info_size = sizeof(struct tsconfig_req_info), .reply_data_size = sizeof(struct tsconfig_reply_data), .prepare_data = tsconfig_prepare_data, .reply_size = tsconfig_reply_size, .fill_reply = tsconfig_fill_reply, .set_validate = ethnl_set_tsconfig_validate, .set = ethnl_set_tsconfig, };
208 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* * net/tipc/eth_media.c: Ethernet bearer support for TIPC * * Copyright (c) 2001-2007, 2013-2014, Ericsson AB * Copyright (c) 2005-2008, 2011-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "bearer.h" /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) { if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ return 1; sprintf(strbuf, "%pM", addr->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } /* Convert discovery msg addr format to Ethernet media addr format */ static int tipc_eth_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { /* Skip past preamble: */ msg += TIPC_MEDIA_ADDR_OFFSET; return tipc_eth_raw2addr(b, addr, msg); } /* Ethernet media registration info */ struct tipc_media eth_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_eth_addr2str, .addr2msg = tipc_eth_addr2msg, .msg2addr = tipc_eth_msg2addr, .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" };
21 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> */ #include "core_priv.h" /** * ib_device_register_rdmacg - register with rdma cgroup. * @device: device to register to participate in resource * accounting by rdma cgroup. * * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. */ void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; rdmacg_register_device(&device->cg_device); } /** * ib_device_unregister_rdmacg - unregister with rdma cgroup. * @device: device to unregister. * * Unregister with the rdma cgroup. Should be called after * all the resources are deallocated, and after a stage when any * other resource allocation by user application cannot be done * for this device to avoid any leak in accounting. */ void ib_device_unregister_rdmacg(struct ib_device *device) { rdmacg_unregister_device(&device->cg_device); } int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) { return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, resource_index); } EXPORT_SYMBOL(ib_rdmacg_try_charge); void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) { rdmacg_uncharge(cg_obj->cg, &device->cg_device, resource_index); } EXPORT_SYMBOL(ib_rdmacg_uncharge);
3 1 1 8 3 3 6 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Round-Robin Scheduling module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * * Fixes/Changes: * Wensong Zhang : changed the ip_vs_rr_schedule to return dest * Julian Anastasov : fixed the NULL pointer access bug in debugging * Wensong Zhang : changed some comestics things for debugging * Wensong Zhang : changed for the d-linked destination list * Wensong Zhang : added the ip_vs_rr_update_svc * Wensong Zhang : added any dest with weight=0 is quiesced */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> static int ip_vs_rr_init_svc(struct ip_vs_service *svc) { svc->sched_data = &svc->destinations; return 0; } static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct list_head *p; spin_lock_bh(&svc->sched_lock); p = (struct list_head *) svc->sched_data; /* dest is already unlinked, so p->prev is not valid but * p->next is valid, use it to reach previous entry. */ if (p == &dest->n_list) svc->sched_data = p->next->prev; spin_unlock_bh(&svc->sched_lock); return 0; } /* * Round-Robin Scheduling */ static struct ip_vs_dest * ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct list_head *p; struct ip_vs_dest *dest, *last; int pass = 0; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); spin_lock_bh(&svc->sched_lock); p = (struct list_head *) svc->sched_data; last = dest = list_entry(p, struct ip_vs_dest, n_list); do { list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) /* HIT */ goto out; if (dest == last) goto stop; } pass++; /* Previous dest could be unlinked, do not loop forever. * If we stay at head there is no need for 2nd pass. */ } while (pass < 2 && p != &svc->destinations); stop: spin_unlock_bh(&svc->sched_lock); ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: svc->sched_data = &dest->n_list; spin_unlock_bh(&svc->sched_lock); IP_VS_DBG_BUF(6, "RR: server %s:%u " "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), refcount_read(&dest->refcnt), atomic_read(&dest->weight)); return dest; } static struct ip_vs_scheduler ip_vs_rr_scheduler = { .name = "rr", /* name */ .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), .init_service = ip_vs_rr_init_svc, .add_dest = NULL, .del_dest = ip_vs_rr_del_dest, .schedule = ip_vs_rr_schedule, }; static int __init ip_vs_rr_init(void) { return register_ip_vs_scheduler(&ip_vs_rr_scheduler); } static void __exit ip_vs_rr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); synchronize_rcu(); } module_init(ip_vs_rr_init); module_exit(ip_vs_rr_cleanup); MODULE_DESCRIPTION("ipvs round-robin scheduler"); MODULE_LICENSE("GPL");
194 194 194 194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 // SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * sysctl_net_core.c: sysctl interface to net core subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ #include <linux/filter.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> #include <net/hotdata.h> #include <net/proto_memory.h> #include <net/rps.h> #include "dev.h" #include "net-sysfs.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); /* 0 - Keep current behavior: * IPv4: inherit all current settings from init_net * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, struct cpumask *mask) { char *kbuf; int len; if (*ppos || !*lenp) { *lenp = 0; return 0; } /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 * nibbles (except the last one which has a newline instead). * Guesstimate the buffer size at the group granularity level. */ len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); kbuf = kmalloc(len, GFP_KERNEL); if (!kbuf) { *lenp = 0; return -ENOMEM; } len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; goto free_buf; } /* scnprintf writes a trailing null char not counted in the returned * length, override it with a newline. */ kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; free_buf: kfree(kbuf); return 0; } #endif #ifdef CONFIG_RPS DEFINE_MUTEX(rps_default_mask_mutex); static int rps_default_mask_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->data; struct cpumask *mask; int err = 0; mutex_lock(&rps_default_mask_mutex); mask = net->core.rps_default_mask; if (write) { if (!mask) { mask = kzalloc(cpumask_size(), GFP_KERNEL); net->core.rps_default_mask = mask; } err = -ENOMEM; if (!mask) goto done; err = cpumask_parse(buffer, mask); if (err) goto done; err = rps_cpumask_housekeeping(mask); if (err) goto done; } else { err = dump_cpumask(buffer, lenp, ppos, mask ?: cpu_none_mask); } done: mutex_unlock(&rps_default_mask_mutex); return err; } static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode }; struct rps_sock_flow_table *orig_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); mutex_lock(&sock_flow_mutex); orig_sock_table = rcu_dereference_protected( net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (size) { if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; } size = roundup_pow_of_two(size); if (size != orig_size) { sock_table = vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); if (!sock_table) { mutex_unlock(&sock_flow_mutex); return -ENOMEM; } net_hotdata.rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; for (i = 0; i < size; i++) sock_table->ents[i] = RPS_NO_CPU; } else sock_table = NULL; if (sock_table != orig_sock_table) { rcu_assign_pointer(net_hotdata.rps_sock_flow_table, sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); } if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); kvfree_rcu(orig_sock_table, rcu); } } } mutex_unlock(&sock_flow_mutex); return ret; } #endif /* CONFIG_RPS */ #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; cpumask_var_t mask; int i, len, ret = 0; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; if (write) { ret = cpumask_parse(buffer, mask); if (ret) goto done; mutex_lock(&flow_limit_update_mutex); len = sizeof(*cur) + netdev_flow_limit_table_len; for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); cur = rcu_dereference_protected(sd->flow_limit, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; goto write_unlock; } cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); if (rcu_dereference(sd->flow_limit)) cpumask_set_cpu(i, mask); } rcu_read_unlock(); ret = dump_cpumask(buffer, lenp, ppos, mask); } done: free_cpumask_var(mask); return ret; } static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; mutex_lock(&flow_limit_update_mutex); ptr = table->data; old = *ptr; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write && !is_power_of_2(*ptr)) { *ptr = old; ret = -EINVAL; } mutex_unlock(&flow_limit_update_mutex); return ret; } #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED static int set_default_qdisc(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { .data = id, .maxlen = IFNAMSIZ, }; int ret; qdisc_get_default(id, IFNAMSIZ); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = qdisc_set_default(id); return ret; } #endif static int proc_do_dev_weight(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); } #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; int min = *(int *)table->extra1; int max = *(int *)table->extra2; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &jit_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); } else { ret = -EPERM; } } if (write && ret && min == max) pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); return ret; } # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } # endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif static struct ctl_table net_core_table[] = { { .procname = "mem_pcpu_rsv", .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_mem_pcpu_rsv, }, { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", .data = &dev_weight_rx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", .data = &dev_weight_tx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "netdev_rss_key", .data = &netdev_rss_key, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_do_rss_key, }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, # else .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT { .procname = "bpf_jit_harden", .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, # endif { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = SYSCTL_LONG_ONE, .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { .procname = "flow_limit_cpu_bitmap", .mode = 0644, .proc_handler = flow_limit_cpu_sysctl }, { .procname = "flow_limit_table_len", .data = &netdev_flow_limit_table_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_RX_BUSY_POLL { .procname = "busy_poll", .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NET_SCHED { .procname = "default_qdisc", .mode = 0644, .maxlen = IFNAMSIZ, .proc_handler = set_default_qdisc }, #endif { .procname = "netdev_budget", .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_skb_frags", .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &max_skb_frags, }, { .procname = "netdev_budget_usecs", .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", .data = &sysctl_fb_tunnels_only_for_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "devconf_inherit_init_net", .data = &sysctl_devconf_inherit_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "high_order_alloc_disable", .data = &net_high_order_alloc_disable_key.key, .maxlen = sizeof(net_high_order_alloc_disable_key), .mode = 0644, .proc_handler = proc_do_static_key, }, { .procname = "gro_normal_batch", .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "netdev_unregister_timeout_secs", .data = &netdev_unregister_timeout_secs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, { .procname = "skb_defer_max", .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, }; static struct ctl_table netns_core_table[] = { #if IS_ENABLED(CONFIG_RPS) { .procname = "rps_default_mask", .data = &init_net, .mode = 0644, .proc_handler = rps_default_mask_sysctl }, #endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { .procname = "optmem_max", .data = &init_net.core.sysctl_optmem_max, .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, { .procname = "txq_reselection_ms", .data = &init_net.core.sysctl_txq_reselection, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "tstamp_allow_data", .data = &init_net.core.sysctl_tstamp_allow_data, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "bypass_prot_mem", .data = &init_net.core.sysctl_bypass_prot_mem, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ { .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) { /* fallback tunnels for initns only */ if (!strncmp(str, "initns", 6)) sysctl_fb_tunnels_only_for_init_net = 1; /* no fallback tunnels anywhere */ else if (!strncmp(str, "none", 4)) sysctl_fb_tunnels_only_for_init_net = 2; return 1; } __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(netns_core_table); struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; for (i = 0; i < table_size; ++i) { if (tbl[i].data == &sysctl_wmem_max) break; tbl[i].data += (char *)net - (char *)&init_net; } for (; i < table_size; ++i) tbl[i].mode &= ~0222; } net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; return 0; err_reg: if (tbl != netns_core_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_core_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); #if IS_ENABLED(CONFIG_RPS) kfree(net->core.rps_default_mask); #endif kfree(tbl); } static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, }; static __init int sysctl_core_init(void) { register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } fs_initcall(sysctl_core_init);
6 2 6 13 13 13 13 13 13 13 13 13 13 13 13 13 3 13 13 6 7 7 5 5 1 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 2 1 1 1 1 2 2 2 1 2 1 1 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 // SPDX-License-Identifier: GPL-2.0-or-later /* A network driver using virtio. * * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ //#define DEBUG #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_net.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/scatterlist.h> #include <linux/if_vlan.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/average.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/dim.h> #include <net/route.h> #include <net/xdp.h> #include <net/net_failover.h> #include <net/netdev_rx_queue.h> #include <net/netdev_queues.h> #include <net/xdp_sock_drv.h> static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); static bool csum = true, gso = true, napi_tx = true; module_param(csum, bool, 0444); module_param(gso, bool, 0444); module_param(napi_tx, bool, 0644); #define VIRTIO_OFFLOAD_MAP_MIN 46 #define VIRTIO_OFFLOAD_MAP_MAX 47 #define VIRTIO_FEATURES_MAP_MIN 65 #define VIRTIO_O2F_DELTA (VIRTIO_FEATURES_MAP_MIN - \ VIRTIO_OFFLOAD_MAP_MIN) static bool virtio_is_mapped_offload(unsigned int obit) { return obit >= VIRTIO_OFFLOAD_MAP_MIN && obit <= VIRTIO_OFFLOAD_MAP_MAX; } static unsigned int virtio_offload_to_feature(unsigned int obit) { return virtio_is_mapped_offload(obit) ? obit + VIRTIO_O2F_DELTA : obit; } /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* Separating two types of XDP xmit */ #define VIRTIO_XDP_TX BIT(0) #define VIRTIO_XDP_REDIR BIT(1) /* RX packet size EWMA. The average packet size is used to determine the packet * buffer size when refilling RX rings. As the entire RX ring may be refilled * at once, the weight is chosen so that the EWMA will be insensitive to short- * term, transient changes in packet size. */ DECLARE_EWMA(pkt_len, 0, 64) #define VIRTNET_DRIVER_VERSION "1.0.0" static const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, VIRTIO_NET_F_GUEST_HDRLEN, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED, }; #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO) | \ (1ULL << VIRTIO_NET_F_GUEST_USO4) | \ (1ULL << VIRTIO_NET_F_GUEST_USO6) | \ (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED) | \ (1ULL << VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED)) struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; size_t qstat_offset; }; struct virtnet_sq_free_stats { u64 packets; u64 bytes; u64 napi_packets; u64 napi_bytes; u64 xsk; }; struct virtnet_sq_stats { struct u64_stats_sync syncp; u64_stats_t packets; u64_stats_t bytes; u64_stats_t xdp_tx; u64_stats_t xdp_tx_drops; u64_stats_t kicks; u64_stats_t tx_timeouts; u64_stats_t stop; u64_stats_t wake; }; struct virtnet_rq_stats { struct u64_stats_sync syncp; u64_stats_t packets; u64_stats_t bytes; u64_stats_t drops; u64_stats_t xdp_packets; u64_stats_t xdp_tx; u64_stats_t xdp_redirects; u64_stats_t xdp_drops; u64_stats_t kicks; }; #define VIRTNET_SQ_STAT(name, m) {name, offsetof(struct virtnet_sq_stats, m), -1} #define VIRTNET_RQ_STAT(name, m) {name, offsetof(struct virtnet_rq_stats, m), -1} #define VIRTNET_SQ_STAT_QSTAT(name, m) \ { \ name, \ offsetof(struct virtnet_sq_stats, m), \ offsetof(struct netdev_queue_stats_tx, m), \ } #define VIRTNET_RQ_STAT_QSTAT(name, m) \ { \ name, \ offsetof(struct virtnet_rq_stats, m), \ offsetof(struct netdev_queue_stats_rx, m), \ } static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = { VIRTNET_SQ_STAT("xdp_tx", xdp_tx), VIRTNET_SQ_STAT("xdp_tx_drops", xdp_tx_drops), VIRTNET_SQ_STAT("kicks", kicks), VIRTNET_SQ_STAT("tx_timeouts", tx_timeouts), }; static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = { VIRTNET_RQ_STAT("drops", drops), VIRTNET_RQ_STAT("xdp_packets", xdp_packets), VIRTNET_RQ_STAT("xdp_tx", xdp_tx), VIRTNET_RQ_STAT("xdp_redirects", xdp_redirects), VIRTNET_RQ_STAT("xdp_drops", xdp_drops), VIRTNET_RQ_STAT("kicks", kicks), }; static const struct virtnet_stat_desc virtnet_sq_stats_desc_qstat[] = { VIRTNET_SQ_STAT_QSTAT("packets", packets), VIRTNET_SQ_STAT_QSTAT("bytes", bytes), VIRTNET_SQ_STAT_QSTAT("stop", stop), VIRTNET_SQ_STAT_QSTAT("wake", wake), }; static const struct virtnet_stat_desc virtnet_rq_stats_desc_qstat[] = { VIRTNET_RQ_STAT_QSTAT("packets", packets), VIRTNET_RQ_STAT_QSTAT("bytes", bytes), }; #define VIRTNET_STATS_DESC_CQ(name) \ {#name, offsetof(struct virtio_net_stats_cvq, name), -1} #define VIRTNET_STATS_DESC_RX(class, name) \ {#name, offsetof(struct virtio_net_stats_rx_ ## class, rx_ ## name), -1} #define VIRTNET_STATS_DESC_TX(class, name) \ {#name, offsetof(struct virtio_net_stats_tx_ ## class, tx_ ## name), -1} static const struct virtnet_stat_desc virtnet_stats_cvq_desc[] = { VIRTNET_STATS_DESC_CQ(command_num), VIRTNET_STATS_DESC_CQ(ok_num), }; static const struct virtnet_stat_desc virtnet_stats_rx_basic_desc[] = { VIRTNET_STATS_DESC_RX(basic, packets), VIRTNET_STATS_DESC_RX(basic, bytes), VIRTNET_STATS_DESC_RX(basic, notifications), VIRTNET_STATS_DESC_RX(basic, interrupts), }; static const struct virtnet_stat_desc virtnet_stats_tx_basic_desc[] = { VIRTNET_STATS_DESC_TX(basic, packets), VIRTNET_STATS_DESC_TX(basic, bytes), VIRTNET_STATS_DESC_TX(basic, notifications), VIRTNET_STATS_DESC_TX(basic, interrupts), }; static const struct virtnet_stat_desc virtnet_stats_rx_csum_desc[] = { VIRTNET_STATS_DESC_RX(csum, needs_csum), }; static const struct virtnet_stat_desc virtnet_stats_tx_gso_desc[] = { VIRTNET_STATS_DESC_TX(gso, gso_packets_noseg), VIRTNET_STATS_DESC_TX(gso, gso_bytes_noseg), }; static const struct virtnet_stat_desc virtnet_stats_rx_speed_desc[] = { VIRTNET_STATS_DESC_RX(speed, ratelimit_bytes), }; static const struct virtnet_stat_desc virtnet_stats_tx_speed_desc[] = { VIRTNET_STATS_DESC_TX(speed, ratelimit_bytes), }; #define VIRTNET_STATS_DESC_RX_QSTAT(class, name, qstat_field) \ { \ #name, \ offsetof(struct virtio_net_stats_rx_ ## class, rx_ ## name), \ offsetof(struct netdev_queue_stats_rx, qstat_field), \ } #define VIRTNET_STATS_DESC_TX_QSTAT(class, name, qstat_field) \ { \ #name, \ offsetof(struct virtio_net_stats_tx_ ## class, tx_ ## name), \ offsetof(struct netdev_queue_stats_tx, qstat_field), \ } static const struct virtnet_stat_desc virtnet_stats_rx_basic_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(basic, drops, hw_drops), VIRTNET_STATS_DESC_RX_QSTAT(basic, drop_overruns, hw_drop_overruns), }; static const struct virtnet_stat_desc virtnet_stats_tx_basic_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(basic, drops, hw_drops), VIRTNET_STATS_DESC_TX_QSTAT(basic, drop_malformed, hw_drop_errors), }; static const struct virtnet_stat_desc virtnet_stats_rx_csum_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_valid, csum_unnecessary), VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_none, csum_none), VIRTNET_STATS_DESC_RX_QSTAT(csum, csum_bad, csum_bad), }; static const struct virtnet_stat_desc virtnet_stats_tx_csum_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(csum, csum_none, csum_none), VIRTNET_STATS_DESC_TX_QSTAT(csum, needs_csum, needs_csum), }; static const struct virtnet_stat_desc virtnet_stats_rx_gso_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_packets, hw_gro_packets), VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_bytes, hw_gro_bytes), VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_packets_coalesced, hw_gro_wire_packets), VIRTNET_STATS_DESC_RX_QSTAT(gso, gso_bytes_coalesced, hw_gro_wire_bytes), }; static const struct virtnet_stat_desc virtnet_stats_tx_gso_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_packets, hw_gso_packets), VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_bytes, hw_gso_bytes), VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_segments, hw_gso_wire_packets), VIRTNET_STATS_DESC_TX_QSTAT(gso, gso_segments_bytes, hw_gso_wire_bytes), }; static const struct virtnet_stat_desc virtnet_stats_rx_speed_desc_qstat[] = { VIRTNET_STATS_DESC_RX_QSTAT(speed, ratelimit_packets, hw_drop_ratelimits), }; static const struct virtnet_stat_desc virtnet_stats_tx_speed_desc_qstat[] = { VIRTNET_STATS_DESC_TX_QSTAT(speed, ratelimit_packets, hw_drop_ratelimits), }; #define VIRTNET_Q_TYPE_RX 0 #define VIRTNET_Q_TYPE_TX 1 #define VIRTNET_Q_TYPE_CQ 2 struct virtnet_interrupt_coalesce { u32 max_packets; u32 max_usecs; }; /* The dma information of pages allocated at a time. */ struct virtnet_rq_dma { dma_addr_t addr; u32 ref; u16 len; u16 need_sync; }; /* Internal representation of a send virtqueue */ struct send_queue { /* Virtqueue associated with this send _queue */ struct virtqueue *vq; /* TX: fragments + linear part + virtio header */ struct scatterlist sg[MAX_SKB_FRAGS + 2]; /* Name of the send queue: output.$index */ char name[16]; struct virtnet_sq_stats stats; struct virtnet_interrupt_coalesce intr_coal; struct napi_struct napi; /* Record whether sq is in reset state. */ bool reset; struct xsk_buff_pool *xsk_pool; dma_addr_t xsk_hdr_dma_addr; }; /* Internal representation of a receive virtqueue */ struct receive_queue { /* Virtqueue associated with this receive_queue */ struct virtqueue *vq; struct napi_struct napi; struct bpf_prog __rcu *xdp_prog; struct virtnet_rq_stats stats; /* The number of rx notifications */ u16 calls; /* Is dynamic interrupt moderation enabled? */ bool dim_enabled; /* Used to protect dim_enabled and inter_coal */ struct mutex dim_lock; /* Dynamic Interrupt Moderation */ struct dim dim; u32 packets_in_napi; struct virtnet_interrupt_coalesce intr_coal; /* Chain pages by the private ptr. */ struct page *pages; /* Average packet length for mergeable receive buffers. */ struct ewma_pkt_len mrg_avg_pkt_len; /* Page frag for packet buffer allocation. */ struct page_frag alloc_frag; /* RX: fragments + linear part + virtio header */ struct scatterlist sg[MAX_SKB_FRAGS + 2]; /* Min single buffer size for mergeable buffers case. */ unsigned int min_buf_len; /* Name of this receive queue: input.$index */ char name[16]; struct xdp_rxq_info xdp_rxq; /* Record the last dma info to free after new pages is allocated. */ struct virtnet_rq_dma *last_dma; struct xsk_buff_pool *xsk_pool; /* xdp rxq used by xsk */ struct xdp_rxq_info xsk_rxq_info; struct xdp_buff **xsk_buffs; }; #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 /* Control VQ buffers: protected by the rtnl lock */ struct control_buf { struct virtio_net_ctrl_hdr hdr; virtio_net_ctrl_ack status; }; struct virtnet_info { struct virtio_device *vdev; struct virtqueue *cvq; struct net_device *dev; struct send_queue *sq; struct receive_queue *rq; unsigned int status; /* Max # of queue pairs supported by the device */ u16 max_queue_pairs; /* # of queue pairs currently used by the driver */ u16 curr_queue_pairs; /* # of XDP queue pairs currently used by the driver */ u16 xdp_queue_pairs; /* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */ bool xdp_enabled; /* I like... big packets and I cannot lie! */ bool big_packets; /* number of sg entries allocated for big packets */ unsigned int big_packets_num_skbfrags; /* Host will merge rx buffers for big packets (shake it! shake it!) */ bool mergeable_rx_bufs; /* Host supports rss and/or hash report */ bool has_rss; bool has_rss_hash_report; u8 rss_key_size; u16 rss_indir_table_size; u32 rss_hash_types_supported; u32 rss_hash_types_saved; struct virtio_net_rss_config_hdr *rss_hdr; struct virtio_net_rss_config_trailer rss_trailer; u8 rss_hash_key_data[VIRTIO_NET_RSS_MAX_KEY_SIZE]; /* Has control virtqueue */ bool has_cvq; /* Lock to protect the control VQ */ struct mutex cvq_lock; /* Host can handle any s/g split between our header and packet data */ bool any_header_sg; /* Packet virtio header size */ u8 hdr_len; /* Work struct for delayed refilling if we run low on memory. */ struct delayed_work refill; /* UDP tunnel support */ bool tx_tnl; bool rx_tnl; bool rx_tnl_csum; /* Is delayed refill enabled? */ bool refill_enabled; /* The lock to synchronize the access to refill_enabled */ spinlock_t refill_lock; /* Work struct for config space updates */ struct work_struct config_work; /* Work struct for setting rx mode */ struct work_struct rx_mode_work; /* OK to queue work setting RX mode? */ bool rx_mode_work_enabled; /* Does the affinity hint is set for virtqueues? */ bool affinity_hint_set; /* CPU hotplug instances for online & dead */ struct hlist_node node; struct hlist_node node_dead; struct control_buf *ctrl; /* Ethtool settings */ u8 duplex; u32 speed; /* Is rx dynamic interrupt moderation enabled? */ bool rx_dim_enabled; /* Interrupt coalescing settings */ struct virtnet_interrupt_coalesce intr_coal_tx; struct virtnet_interrupt_coalesce intr_coal_rx; unsigned long guest_offloads; unsigned long guest_offloads_capable; /* failover when STANDBY feature enabled */ struct failover *failover; u64 device_stats_cap; }; struct padded_vnet_hdr { struct virtio_net_hdr_v1_hash hdr; /* * hdr is in a separate sg buffer, and data sg buffer shares same page * with this header sg. This padding makes next sg 16 byte aligned * after the header. */ char padding[12]; }; struct virtio_net_common_hdr { union { struct virtio_net_hdr hdr; struct virtio_net_hdr_mrg_rxbuf mrg_hdr; struct virtio_net_hdr_v1_hash hash_v1_hdr; struct virtio_net_hdr_v1_hash_tunnel tnl_hdr; }; }; static struct virtio_net_common_hdr xsk_hdr; static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq); static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats); static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, struct sk_buff *skb, u8 flags); static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct sk_buff *curr_skb, struct page *page, void *buf, int len, int truesize); static void virtnet_xsk_completed(struct send_queue *sq, int num); enum virtnet_xmit_type { VIRTNET_XMIT_TYPE_SKB, VIRTNET_XMIT_TYPE_SKB_ORPHAN, VIRTNET_XMIT_TYPE_XDP, VIRTNET_XMIT_TYPE_XSK, }; static size_t virtnet_rss_hdr_size(const struct virtnet_info *vi) { u16 indir_table_size = vi->has_rss ? vi->rss_indir_table_size : 1; return struct_size(vi->rss_hdr, indirection_table, indir_table_size); } static size_t virtnet_rss_trailer_size(const struct virtnet_info *vi) { return struct_size(&vi->rss_trailer, hash_key_data, vi->rss_key_size); } /* We use the last two bits of the pointer to distinguish the xmit type. */ #define VIRTNET_XMIT_TYPE_MASK (BIT(0) | BIT(1)) #define VIRTIO_XSK_FLAG_OFFSET 2 static enum virtnet_xmit_type virtnet_xmit_ptr_unpack(void **ptr) { unsigned long p = (unsigned long)*ptr; *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK); return p & VIRTNET_XMIT_TYPE_MASK; } static void *virtnet_xmit_ptr_pack(void *ptr, enum virtnet_xmit_type type) { return (void *)((unsigned long)ptr | type); } static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data, enum virtnet_xmit_type type) { return virtqueue_add_outbuf(sq->vq, sq->sg, num, virtnet_xmit_ptr_pack(data, type), GFP_ATOMIC); } static u32 virtnet_ptr_to_xsk_buff_len(void *ptr) { return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET; } static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) { sg_dma_address(sg) = addr; sg_dma_len(sg) = len; } static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi, struct virtnet_sq_free_stats *stats) { struct xdp_frame *frame; struct sk_buff *skb; unsigned int len; void *ptr; while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { switch (virtnet_xmit_ptr_unpack(&ptr)) { case VIRTNET_XMIT_TYPE_SKB: skb = ptr; pr_debug("Sent skb %p\n", skb); stats->napi_packets++; stats->napi_bytes += skb->len; napi_consume_skb(skb, in_napi); break; case VIRTNET_XMIT_TYPE_SKB_ORPHAN: skb = ptr; stats->packets++; stats->bytes += skb->len; napi_consume_skb(skb, in_napi); break; case VIRTNET_XMIT_TYPE_XDP: frame = ptr; stats->packets++; stats->bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); break; case VIRTNET_XMIT_TYPE_XSK: stats->bytes += virtnet_ptr_to_xsk_buff_len(ptr); stats->xsk++; break; } } netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); } static void virtnet_free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi, struct virtnet_sq_free_stats *stats) { __free_old_xmit(sq, txq, in_napi, stats); if (stats->xsk) virtnet_xsk_completed(sq, stats->xsk); } /* Converting between virtqueue no. and kernel tx/rx queue no. * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq */ static int vq2txq(struct virtqueue *vq) { return (vq->index - 1) / 2; } static int txq2vq(int txq) { return txq * 2 + 1; } static int vq2rxq(struct virtqueue *vq) { return vq->index / 2; } static int rxq2vq(int rxq) { return rxq * 2; } static int vq_type(struct virtnet_info *vi, int qid) { if (qid == vi->max_queue_pairs * 2) return VIRTNET_Q_TYPE_CQ; if (qid % 2) return VIRTNET_Q_TYPE_TX; return VIRTNET_Q_TYPE_RX; } static inline struct virtio_net_common_hdr * skb_vnet_common_hdr(struct sk_buff *skb) { return (struct virtio_net_common_hdr *)skb->cb; } /* * private is used to chain pages for big packets, put the whole * most recent used list in the beginning for reuse */ static void give_pages(struct receive_queue *rq, struct page *page) { struct page *end; /* Find end of list, sew whole thing into vi->rq.pages. */ for (end = page; end->private; end = (struct page *)end->private); end->private = (unsigned long)rq->pages; rq->pages = page; } static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask) { struct page *p = rq->pages; if (p) { rq->pages = (struct page *)p->private; /* clear private here, it is used to chain pages */ p->private = 0; } else p = alloc_page(gfp_mask); return p; } static void virtnet_rq_free_buf(struct virtnet_info *vi, struct receive_queue *rq, void *buf) { if (vi->mergeable_rx_bufs) put_page(virt_to_head_page(buf)); else if (vi->big_packets) give_pages(rq, buf); else put_page(virt_to_head_page(buf)); } static void enable_delayed_refill(struct virtnet_info *vi) { spin_lock_bh(&vi->refill_lock); vi->refill_enabled = true; spin_unlock_bh(&vi->refill_lock); } static void disable_delayed_refill(struct virtnet_info *vi) { spin_lock_bh(&vi->refill_lock); vi->refill_enabled = false; spin_unlock_bh(&vi->refill_lock); } static void enable_rx_mode_work(struct virtnet_info *vi) { rtnl_lock(); vi->rx_mode_work_enabled = true; rtnl_unlock(); } static void disable_rx_mode_work(struct virtnet_info *vi) { rtnl_lock(); vi->rx_mode_work_enabled = false; rtnl_unlock(); } static void virtqueue_napi_schedule(struct napi_struct *napi, struct virtqueue *vq) { if (napi_schedule_prep(napi)) { virtqueue_disable_cb(vq); __napi_schedule(napi); } } static bool virtqueue_napi_complete(struct napi_struct *napi, struct virtqueue *vq, int processed) { int opaque; opaque = virtqueue_enable_cb_prepare(vq); if (napi_complete_done(napi, processed)) { if (unlikely(virtqueue_poll(vq, opaque))) virtqueue_napi_schedule(napi, vq); else return true; } else { virtqueue_disable_cb(vq); } return false; } static void skb_xmit_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi; /* Suppress further interrupts. */ virtqueue_disable_cb(vq); if (napi->weight) virtqueue_napi_schedule(napi, vq); else /* We were probably waiting for more output buffers. */ netif_wake_subqueue(vi->dev, vq2txq(vq)); } #define MRG_CTX_HEADER_SHIFT 22 static void *mergeable_len_to_ctx(unsigned int truesize, unsigned int headroom) { return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize); } static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx) { return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT; } static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx) { return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1); } static int check_mergeable_len(struct net_device *dev, void *mrg_ctx, unsigned int len) { unsigned int headroom, tailroom, room, truesize; truesize = mergeable_ctx_to_truesize(mrg_ctx); headroom = mergeable_ctx_to_headroom(mrg_ctx); tailroom = headroom ? sizeof(struct skb_shared_info) : 0; room = SKB_DATA_ALIGN(headroom + tailroom); if (len > truesize - room) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)(truesize - room)); DEV_STATS_INC(dev, rx_length_errors); return -1; } return 0; } static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen, unsigned int headroom, unsigned int len) { struct sk_buff *skb; skb = build_skb(buf, buflen); if (unlikely(!skb)) return NULL; skb_reserve(skb, headroom); skb_put(skb, len); return skb; } /* Called from bottom half context */ static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, unsigned int len, unsigned int truesize, unsigned int headroom) { struct sk_buff *skb; struct virtio_net_common_hdr *hdr; unsigned int copy, hdr_len, hdr_padded_len; struct page *page_to_free = NULL; int tailroom, shinfo_size; char *p, *hdr_p, *buf; p = page_address(page) + offset; hdr_p = p; hdr_len = vi->hdr_len; if (vi->mergeable_rx_bufs) hdr_padded_len = hdr_len; else hdr_padded_len = sizeof(struct padded_vnet_hdr); buf = p - headroom; len -= hdr_len; offset += hdr_padded_len; p += hdr_padded_len; tailroom = truesize - headroom - hdr_padded_len - len; shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) { skb = virtnet_build_skb(buf, truesize, p - buf, len); if (unlikely(!skb)) return NULL; page = (struct page *)page->private; if (page) give_pages(rq, page); goto ok; } /* copy small packet so we can reuse these pages for small data */ skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN); if (unlikely(!skb)) return NULL; /* Copy all frame if it fits skb->head, otherwise * we let virtio_net_hdr_to_skb() and GRO pull headers as needed. */ if (len <= skb_tailroom(skb)) copy = len; else copy = ETH_HLEN; skb_put_data(skb, p, copy); len -= copy; offset += copy; if (vi->mergeable_rx_bufs) { if (len) skb_add_rx_frag(skb, 0, page, offset, len, truesize); else page_to_free = page; goto ok; } /* * Verify that we can indeed put this data into a skb. * This is here to handle cases when the device erroneously * tries to receive more than is possible. This is usually * the case of a broken device. */ if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) { net_dbg_ratelimited("%s: too much data\n", skb->dev->name); dev_kfree_skb(skb); return NULL; } BUG_ON(offset >= PAGE_SIZE); while (len) { unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, frag_size, truesize); len -= frag_size; page = (struct page *)page->private; offset = 0; } if (page) give_pages(rq, page); ok: hdr = skb_vnet_common_hdr(skb); memcpy(hdr, hdr_p, hdr_len); if (page_to_free) put_page(page_to_free); return skb; } static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) { struct virtnet_info *vi = rq->vq->vdev->priv; struct page *page = virt_to_head_page(buf); struct virtnet_rq_dma *dma; void *head; int offset; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(page); dma = head; --dma->ref; if (dma->need_sync && len) { offset = buf - (head + sizeof(*dma)); virtqueue_map_sync_single_range_for_cpu(rq->vq, dma->addr, offset, len, DMA_FROM_DEVICE); } if (dma->ref) return; virtqueue_unmap_single_attrs(rq->vq, dma->addr, dma->len, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); put_page(page); } static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx) { struct virtnet_info *vi = rq->vq->vdev->priv; void *buf; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); buf = virtqueue_get_buf_ctx(rq->vq, len, ctx); if (buf) virtnet_rq_unmap(rq, buf, *len); return buf; } static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) { struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_dma *dma; dma_addr_t addr; u32 offset; void *head; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(rq->alloc_frag.page); offset = buf - head; dma = head; addr = dma->addr - sizeof(*dma) + offset; sg_init_table(rq->sg, 1); sg_fill_dma(rq->sg, addr, len); } static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) { struct page_frag *alloc_frag = &rq->alloc_frag; struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_dma *dma; void *buf, *head; dma_addr_t addr; BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs); head = page_address(alloc_frag->page); dma = head; /* new pages */ if (!alloc_frag->offset) { if (rq->last_dma) { /* Now, the new page is allocated, the last dma * will not be used. So the dma can be unmapped * if the ref is 0. */ virtnet_rq_unmap(rq, rq->last_dma, 0); rq->last_dma = NULL; } dma->len = alloc_frag->size - sizeof(*dma); addr = virtqueue_map_single_attrs(rq->vq, dma + 1, dma->len, DMA_FROM_DEVICE, 0); if (virtqueue_map_mapping_error(rq->vq, addr)) return NULL; dma->addr = addr; dma->need_sync = virtqueue_map_need_sync(rq->vq, addr); /* Add a reference to dma to prevent the entire dma from * being released during error handling. This reference * will be freed after the pages are no longer used. */ get_page(alloc_frag->page); dma->ref = 1; alloc_frag->offset = sizeof(*dma); rq->last_dma = dma; } ++dma->ref; buf = head + alloc_frag->offset; get_page(alloc_frag->page); alloc_frag->offset += size; return buf; } static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; struct receive_queue *rq; int i = vq2rxq(vq); rq = &vi->rq[i]; if (rq->xsk_pool) { xsk_buff_free((struct xdp_buff *)buf); return; } if (!vi->big_packets || vi->mergeable_rx_bufs) virtnet_rq_unmap(rq, buf, 0); virtnet_rq_free_buf(vi, rq, buf); } static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi) { struct virtnet_sq_free_stats stats = {0}; virtnet_free_old_xmit(sq, txq, in_napi, &stats); /* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit. */ if (!stats.packets && !stats.napi_packets) return; u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.bytes, stats.bytes + stats.napi_bytes); u64_stats_add(&sq->stats.packets, stats.packets + stats.napi_packets); u64_stats_update_end(&sq->stats.syncp); } static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q) { if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs)) return false; else if (q < vi->curr_queue_pairs) return true; else return false; } static bool tx_may_stop(struct virtnet_info *vi, struct net_device *dev, struct send_queue *sq) { int qnum; qnum = sq - vi->sq; /* If running out of space, stop queue to avoid getting packets that we * are then unable to transmit. * An alternative would be to force queuing layer to requeue the skb by * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be * returned in a normal path of operation: it means that driver is not * maintaining the TX queue stop/start state properly, and causes * the stack to do a non-trivial amount of useless work. * Since most packets only take 1 or 2 ring slots, stopping the queue * early means 16 slots are typically wasted. */ if (sq->vq->num_free < MAX_SKB_FRAGS + 2) { struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); netif_tx_stop_queue(txq); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.stop); u64_stats_update_end(&sq->stats.syncp); return true; } return false; } static void check_sq_full_and_disable(struct virtnet_info *vi, struct net_device *dev, struct send_queue *sq) { bool use_napi = sq->napi.weight; int qnum; qnum = sq - vi->sq; if (tx_may_stop(vi, dev, sq)) { struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); if (use_napi) { if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ free_old_xmit(sq, txq, false); if (sq->vq->num_free >= MAX_SKB_FRAGS + 2) { netif_start_subqueue(dev, qnum); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); u64_stats_update_end(&sq->stats.syncp); virtqueue_disable_cb(sq->vq); } } } } /* Note that @len is the length of received data without virtio header */ static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, struct receive_queue *rq, void *buf, u32 len, bool first_buf) { struct xdp_buff *xdp; u32 bufsize; xdp = (struct xdp_buff *)buf; /* In virtnet_add_recvbuf_xsk, we use part of XDP_PACKET_HEADROOM for * virtio header and ask the vhost to fill data from * hard_start + XDP_PACKET_HEADROOM - vi->hdr_len * The first buffer has virtio header so the remaining region for frame * data is * xsk_pool_get_rx_frame_size() * While other buffers than the first one do not have virtio header, so * the maximum frame data's length can be * xsk_pool_get_rx_frame_size() + vi->hdr_len */ bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool); if (!first_buf) bufsize += vi->hdr_len; if (unlikely(len > bufsize)) { pr_debug("%s: rx error: len %u exceeds truesize %u\n", vi->dev->name, len, bufsize); DEV_STATS_INC(vi->dev, rx_length_errors); xsk_buff_free(xdp); return NULL; } if (first_buf) { xsk_buff_set_size(xdp, len); } else { xdp_prepare_buff(xdp, xdp->data_hard_start, XDP_PACKET_HEADROOM - vi->hdr_len, len, 1); xdp->flags = 0; } xsk_buff_dma_sync_for_cpu(xdp); return xdp; } static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, struct xdp_buff *xdp) { unsigned int metasize = xdp->data - xdp->data_meta; struct sk_buff *skb; unsigned int size; size = xdp->data_end - xdp->data_hard_start; skb = napi_alloc_skb(&rq->napi, size); if (unlikely(!skb)) { xsk_buff_free(xdp); return NULL; } skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); size = xdp->data_end - xdp->data_meta; memcpy(__skb_put(skb, size), xdp->data_meta, size); if (metasize) { __skb_pull(skb, metasize); skb_metadata_set(skb, metasize); } xsk_buff_free(xdp); return skb; } static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct xdp_buff *xdp, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct bpf_prog *prog; u32 ret; ret = XDP_PASS; rcu_read_lock(); prog = rcu_dereference(rq->xdp_prog); if (prog) ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); rcu_read_unlock(); switch (ret) { case XDP_PASS: return xsk_construct_skb(rq, xdp); case XDP_TX: case XDP_REDIRECT: return NULL; default: /* drop packet */ xsk_buff_free(xdp); u64_stats_inc(&stats->drops); return NULL; } } static void xsk_drop_follow_bufs(struct net_device *dev, struct receive_queue *rq, u32 num_buf, struct virtnet_rq_stats *stats) { struct xdp_buff *xdp; u32 len; while (num_buf-- > 1) { xdp = virtqueue_get_buf(rq->vq, &len); if (unlikely(!xdp)) { pr_debug("%s: rx error: %d buffers missing\n", dev->name, num_buf); DEV_STATS_INC(dev, rx_length_errors); break; } u64_stats_add(&stats->bytes, len); xsk_buff_free(xdp); } } static int xsk_append_merge_buffer(struct virtnet_info *vi, struct receive_queue *rq, struct sk_buff *head_skb, u32 num_buf, struct virtio_net_hdr_mrg_rxbuf *hdr, struct virtnet_rq_stats *stats) { struct sk_buff *curr_skb; struct xdp_buff *xdp; u32 len, truesize; struct page *page; void *buf; curr_skb = head_skb; while (--num_buf) { buf = virtqueue_get_buf(rq->vq, &len); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", vi->dev->name, num_buf, virtio16_to_cpu(vi->vdev, hdr->num_buffers)); DEV_STATS_INC(vi->dev, rx_length_errors); return -EINVAL; } u64_stats_add(&stats->bytes, len); xdp = buf_to_xdp(vi, rq, buf, len, false); if (!xdp) goto err; buf = napi_alloc_frag(len); if (!buf) { xsk_buff_free(xdp); goto err; } memcpy(buf, xdp->data, len); xsk_buff_free(xdp); page = virt_to_page(buf); truesize = len; curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, buf, len, truesize); if (!curr_skb) { put_page(page); goto err; } } return 0; err: xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats); return -EINVAL; } static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct xdp_buff *xdp, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr; struct bpf_prog *prog; struct sk_buff *skb; u32 ret, num_buf; hdr = xdp->data - vi->hdr_len; num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); ret = XDP_PASS; rcu_read_lock(); prog = rcu_dereference(rq->xdp_prog); /* TODO: support multi buffer. */ if (prog && num_buf == 1) ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); rcu_read_unlock(); switch (ret) { case XDP_PASS: skb = xsk_construct_skb(rq, xdp); if (!skb) goto drop_bufs; if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) { dev_kfree_skb(skb); goto drop; } return skb; case XDP_TX: case XDP_REDIRECT: return NULL; default: /* drop packet */ xsk_buff_free(xdp); } drop_bufs: xsk_drop_follow_bufs(dev, rq, num_buf, stats); drop: u64_stats_inc(&stats->drops); return NULL; } static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, void *buf, u32 len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct net_device *dev = vi->dev; struct sk_buff *skb = NULL; struct xdp_buff *xdp; u8 flags; len -= vi->hdr_len; u64_stats_add(&stats->bytes, len); xdp = buf_to_xdp(vi, rq, buf, len, true); if (!xdp) return; if (unlikely(len < ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); DEV_STATS_INC(dev, rx_length_errors); xsk_buff_free(xdp); return; } flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; if (!vi->mergeable_rx_bufs) skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); else skb = virtnet_receive_xsk_merge(dev, vi, rq, xdp, xdp_xmit, stats); if (skb) virtnet_receive_done(vi, rq, skb, flags); } static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq, struct xsk_buff_pool *pool, gfp_t gfp) { struct xdp_buff **xsk_buffs; dma_addr_t addr; int err = 0; u32 len, i; int num; xsk_buffs = rq->xsk_buffs; num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free); if (!num) return -ENOMEM; len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len; for (i = 0; i < num; ++i) { /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space. * We assume XDP_PACKET_HEADROOM is larger than hdr->len. * (see function virtnet_xsk_pool_enable) */ addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len; sg_init_table(rq->sg, 1); sg_fill_dma(rq->sg, addr, len); err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, xsk_buffs[i], NULL, gfp); if (err) goto err; } return num; err: for (; i < num; ++i) xsk_buff_free(xsk_buffs[i]); return err; } static void *virtnet_xsk_to_ptr(u32 len) { unsigned long p; p = len << VIRTIO_XSK_FLAG_OFFSET; return virtnet_xmit_ptr_pack((void *)p, VIRTNET_XMIT_TYPE_XSK); } static int virtnet_xsk_xmit_one(struct send_queue *sq, struct xsk_buff_pool *pool, struct xdp_desc *desc) { struct virtnet_info *vi; dma_addr_t addr; vi = sq->vq->vdev->priv; addr = xsk_buff_raw_get_dma(pool, desc->addr); xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len); sg_init_table(sq->sg, 2); sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len); sg_fill_dma(sq->sg + 1, addr, desc->len); return virtqueue_add_outbuf_premapped(sq->vq, sq->sg, 2, virtnet_xsk_to_ptr(desc->len), GFP_ATOMIC); } static int virtnet_xsk_xmit_batch(struct send_queue *sq, struct xsk_buff_pool *pool, unsigned int budget, u64 *kicks) { struct xdp_desc *descs = pool->tx_descs; bool kick = false; u32 nb_pkts, i; int err; budget = min_t(u32, budget, sq->vq->num_free); nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); if (!nb_pkts) return 0; for (i = 0; i < nb_pkts; i++) { err = virtnet_xsk_xmit_one(sq, pool, &descs[i]); if (unlikely(err)) { xsk_tx_completed(sq->xsk_pool, nb_pkts - i); break; } kick = true; } if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) (*kicks)++; return i; } static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool, int budget) { struct virtnet_info *vi = sq->vq->vdev->priv; struct virtnet_sq_free_stats stats = {}; struct net_device *dev = vi->dev; u64 kicks = 0; int sent; /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of * free_old_xmit(). */ __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats); if (stats.xsk) xsk_tx_completed(sq->xsk_pool, stats.xsk); sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks); if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) check_sq_full_and_disable(vi, vi->dev, sq); if (sent) { struct netdev_queue *txq; txq = netdev_get_tx_queue(vi->dev, sq - vi->sq); txq_trans_cond_update(txq); } u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.packets, stats.packets); u64_stats_add(&sq->stats.bytes, stats.bytes); u64_stats_add(&sq->stats.kicks, kicks); u64_stats_add(&sq->stats.xdp_tx, sent); u64_stats_update_end(&sq->stats.syncp); if (xsk_uses_need_wakeup(pool)) xsk_set_tx_need_wakeup(pool); return sent; } static void xsk_wakeup(struct send_queue *sq) { if (napi_if_scheduled_mark_missed(&sq->napi)) return; local_bh_disable(); virtqueue_napi_schedule(&sq->napi, sq->vq); local_bh_enable(); } static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag) { struct virtnet_info *vi = netdev_priv(dev); struct send_queue *sq; if (!netif_running(dev)) return -ENETDOWN; if (qid >= vi->curr_queue_pairs) return -EINVAL; sq = &vi->sq[qid]; xsk_wakeup(sq); return 0; } static void virtnet_xsk_completed(struct send_queue *sq, int num) { xsk_tx_completed(sq->xsk_pool, num); /* If this is called by rx poll, start_xmit and xdp xmit we should * wakeup the tx napi to consume the xsk tx queue, because the tx * interrupt may not be triggered. */ xsk_wakeup(sq); } static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct send_queue *sq, struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; struct skb_shared_info *shinfo; u8 nr_frags = 0; int err, i; if (unlikely(xdpf->headroom < vi->hdr_len)) return -EOVERFLOW; if (unlikely(xdp_frame_has_frags(xdpf))) { shinfo = xdp_get_shared_info_from_frame(xdpf); nr_frags = shinfo->nr_frags; } /* In wrapping function virtnet_xdp_xmit(), we need to free * up the pending old buffers, where we need to calculate the * position of skb_shared_info in xdp_get_frame_len() and * xdp_return_frame(), which will involve to xdpf->data and * xdpf->headroom. Therefore, we need to update the value of * headroom synchronously here. */ xdpf->headroom -= vi->hdr_len; xdpf->data -= vi->hdr_len; /* Zero header and leave csum up to XDP layers */ hdr = xdpf->data; memset(hdr, 0, vi->hdr_len); xdpf->len += vi->hdr_len; sg_init_table(sq->sg, nr_frags + 1); sg_set_buf(sq->sg, xdpf->data, xdpf->len); for (i = 0; i < nr_frags; i++) { skb_frag_t *frag = &shinfo->frags[i]; sg_set_page(&sq->sg[i + 1], skb_frag_page(frag), skb_frag_size(frag), skb_frag_off(frag)); } err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP); if (unlikely(err)) return -ENOSPC; /* Caller handle free/refcnt */ return 0; } /* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on * the current cpu, so it does not need to be locked. * * Here we use marco instead of inline functions because we have to deal with * three issues at the same time: 1. the choice of sq. 2. judge and execute the * lock/unlock of txq 3. make sparse happy. It is difficult for two inline * functions to perfectly solve these three problems at the same time. */ #define virtnet_xdp_get_sq(vi) ({ \ int cpu = smp_processor_id(); \ struct netdev_queue *txq; \ typeof(vi) v = (vi); \ unsigned int qp; \ \ if (v->curr_queue_pairs > nr_cpu_ids) { \ qp = v->curr_queue_pairs - v->xdp_queue_pairs; \ qp += cpu; \ txq = netdev_get_tx_queue(v->dev, qp); \ __netif_tx_acquire(txq); \ } else { \ qp = cpu % v->curr_queue_pairs; \ txq = netdev_get_tx_queue(v->dev, qp); \ __netif_tx_lock(txq, cpu); \ } \ v->sq + qp; \ }) #define virtnet_xdp_put_sq(vi, q) { \ struct netdev_queue *txq; \ typeof(vi) v = (vi); \ \ txq = netdev_get_tx_queue(v->dev, (q) - v->sq); \ if (v->curr_queue_pairs > nr_cpu_ids) \ __netif_tx_release(txq); \ else \ __netif_tx_unlock(txq); \ } static int virtnet_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct virtnet_info *vi = netdev_priv(dev); struct virtnet_sq_free_stats stats = {0}; struct receive_queue *rq = vi->rq; struct bpf_prog *xdp_prog; struct send_queue *sq; int nxmit = 0; int kicks = 0; int ret; int i; /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this * indicate XDP resources have been successfully allocated. */ xdp_prog = rcu_access_pointer(rq->xdp_prog); if (!xdp_prog) return -ENXIO; sq = virtnet_xdp_get_sq(vi); if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) { ret = -EINVAL; goto out; } /* Free up any pending old buffers before queueing new ones. */ virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), false, &stats); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; if (__virtnet_xdp_xmit_one(vi, sq, xdpf)) break; nxmit++; } ret = nxmit; if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) check_sq_full_and_disable(vi, dev, sq); if (flags & XDP_XMIT_FLUSH) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) kicks = 1; } out: u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.bytes, stats.bytes); u64_stats_add(&sq->stats.packets, stats.packets); u64_stats_add(&sq->stats.xdp_tx, n); u64_stats_add(&sq->stats.xdp_tx_drops, n - nxmit); u64_stats_add(&sq->stats.kicks, kicks); u64_stats_update_end(&sq->stats.syncp); virtnet_xdp_put_sq(vi, sq); return ret; } static void put_xdp_frags(struct xdp_buff *xdp) { struct skb_shared_info *shinfo; struct page *xdp_page; int i; if (xdp_buff_has_frags(xdp)) { shinfo = xdp_get_shared_info_from_buff(xdp); for (i = 0; i < shinfo->nr_frags; i++) { xdp_page = skb_frag_page(&shinfo->frags[i]); put_page(xdp_page); } } } static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct xdp_frame *xdpf; int err; u32 act; act = bpf_prog_run_xdp(xdp_prog, xdp); u64_stats_inc(&stats->xdp_packets); switch (act) { case XDP_PASS: return act; case XDP_TX: u64_stats_inc(&stats->xdp_tx); xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) { netdev_dbg(dev, "convert buff to frame failed for xdp\n"); return XDP_DROP; } err = virtnet_xdp_xmit(dev, 1, &xdpf, 0); if (unlikely(!err)) { xdp_return_frame_rx_napi(xdpf); } else if (unlikely(err < 0)) { trace_xdp_exception(dev, xdp_prog, act); return XDP_DROP; } *xdp_xmit |= VIRTIO_XDP_TX; return act; case XDP_REDIRECT: u64_stats_inc(&stats->xdp_redirects); err = xdp_do_redirect(dev, xdp, xdp_prog); if (err) return XDP_DROP; *xdp_xmit |= VIRTIO_XDP_REDIR; return act; default: bpf_warn_invalid_xdp_action(dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); fallthrough; case XDP_DROP: return XDP_DROP; } } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) { return vi->xdp_enabled ? XDP_PACKET_HEADROOM : 0; } /* We copy the packet for XDP in the following cases: * * 1) Packet is scattered across multiple rx buffers. * 2) Headroom space is insufficient. * * This is inefficient but it's a temporary condition that * we hit right after XDP is enabled and until queue is refilled * with large buffers with sufficient headroom - so it should affect * at most queue size packets. * Afterwards, the conditions to enable * XDP should preclude the underlying device from sending packets * across multiple buffers (num_buf > 1), and we make sure buffers * have enough headroom. */ static struct page *xdp_linearize_page(struct net_device *dev, struct receive_queue *rq, int *num_buf, struct page *p, int offset, int page_off, unsigned int *len) { int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct page *page; if (page_off + *len + tailroom > PAGE_SIZE) return NULL; page = alloc_page(GFP_ATOMIC); if (!page) return NULL; memcpy(page_address(page) + page_off, page_address(p) + offset, *len); page_off += *len; /* Only mergeable mode can go inside this while loop. In small mode, * *num_buf == 1, so it cannot go inside. */ while (--*num_buf) { unsigned int buflen; void *buf; void *ctx; int off; buf = virtnet_rq_get_buf(rq, &buflen, &ctx); if (unlikely(!buf)) goto err_buf; p = virt_to_head_page(buf); off = buf - page_address(p); if (check_mergeable_len(dev, ctx, buflen)) { put_page(p); goto err_buf; } /* guard against a misconfigured or uncooperative backend that * is sending packet larger than the MTU. */ if ((page_off + buflen + tailroom) > PAGE_SIZE) { put_page(p); goto err_buf; } memcpy(page_address(page) + page_off, page_address(p) + off, buflen); page_off += buflen; put_page(p); } /* Headroom does not contribute to packet length */ *len = page_off - XDP_PACKET_HEADROOM; return page; err_buf: __free_pages(page, 0); return NULL; } static struct sk_buff *receive_small_build_skb(struct virtnet_info *vi, unsigned int xdp_headroom, void *buf, unsigned int len) { unsigned int header_offset; unsigned int headroom; unsigned int buflen; struct sk_buff *skb; header_offset = VIRTNET_RX_PAD + xdp_headroom; headroom = vi->hdr_len + header_offset; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); skb = virtnet_build_skb(buf, buflen, headroom, len); if (unlikely(!skb)) return NULL; buf += header_offset; memcpy(skb_vnet_common_hdr(skb), buf, vi->hdr_len); return skb; } static struct sk_buff *receive_small_xdp(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, void *buf, unsigned int xdp_headroom, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom; unsigned int headroom = vi->hdr_len + header_offset; struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset; struct page *page = virt_to_head_page(buf); struct page *xdp_page; unsigned int buflen; struct xdp_buff xdp; struct sk_buff *skb; unsigned int metasize = 0; u32 act; if (unlikely(hdr->hdr.gso_type)) goto err_xdp; /* Partially checksummed packets must be dropped. */ if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) goto err_xdp; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) { int offset = buf - page_address(page) + header_offset; unsigned int tlen = len + vi->hdr_len; int num_buf = 1; xdp_headroom = virtnet_get_headroom(vi); header_offset = VIRTNET_RX_PAD + xdp_headroom; headroom = vi->hdr_len + header_offset; buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); xdp_page = xdp_linearize_page(dev, rq, &num_buf, page, offset, header_offset, &tlen); if (!xdp_page) goto err_xdp; buf = page_address(xdp_page); put_page(page); page = xdp_page; } xdp_init_buff(&xdp, buflen, &rq->xdp_rxq); xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len, xdp_headroom, len, true); act = virtnet_xdp_handler(xdp_prog, &xdp, dev, xdp_xmit, stats); switch (act) { case XDP_PASS: /* Recalculate length in case bpf program changed it */ len = xdp.data_end - xdp.data; metasize = xdp.data - xdp.data_meta; break; case XDP_TX: case XDP_REDIRECT: goto xdp_xmit; default: goto err_xdp; } skb = virtnet_build_skb(buf, buflen, xdp.data - buf, len); if (unlikely(!skb)) goto err; if (metasize) skb_metadata_set(skb, metasize); return skb; err_xdp: u64_stats_inc(&stats->xdp_drops); err: u64_stats_inc(&stats->drops); put_page(page); xdp_xmit: return NULL; } static struct sk_buff *receive_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, void *ctx, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int xdp_headroom = (unsigned long)ctx; struct page *page = virt_to_head_page(buf); struct sk_buff *skb; /* We passed the address of virtnet header to virtio-core, * so truncate the padding. */ buf -= VIRTNET_RX_PAD + xdp_headroom; len -= vi->hdr_len; u64_stats_add(&stats->bytes, len); if (unlikely(len > GOOD_PACKET_LEN)) { pr_debug("%s: rx error: len %u exceeds max size %d\n", dev->name, len, GOOD_PACKET_LEN); DEV_STATS_INC(dev, rx_length_errors); goto err; } if (unlikely(vi->xdp_enabled)) { struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { skb = receive_small_xdp(dev, vi, rq, xdp_prog, buf, xdp_headroom, len, xdp_xmit, stats); rcu_read_unlock(); return skb; } rcu_read_unlock(); } skb = receive_small_build_skb(vi, xdp_headroom, buf, len); if (likely(skb)) return skb; err: u64_stats_inc(&stats->drops); put_page(page); return NULL; } static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, unsigned int len, struct virtnet_rq_stats *stats) { struct page *page = buf; struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); u64_stats_add(&stats->bytes, len - vi->hdr_len); if (unlikely(!skb)) goto err; return skb; err: u64_stats_inc(&stats->drops); give_pages(rq, page); return NULL; } static void mergeable_buf_free(struct receive_queue *rq, int num_buf, struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page; void *buf; int len; while (num_buf-- > 1) { buf = virtnet_rq_get_buf(rq, &len, NULL); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers missing\n", dev->name, num_buf); DEV_STATS_INC(dev, rx_length_errors); break; } u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); put_page(page); } } /* Why not use xdp_build_skb_from_frame() ? * XDP core assumes that xdp frags are PAGE_SIZE in length, while in * virtio-net there are 2 points that do not match its requirements: * 1. The size of the prefilled buffer is not fixed before xdp is set. * 2. xdp_build_skb_from_frame() does more checks that we don't need, * like eth_type_trans() (which virtio-net does in receive_buf()). */ static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, struct virtnet_info *vi, struct xdp_buff *xdp, unsigned int xdp_frags_truesz) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); unsigned int headroom, data_len; struct sk_buff *skb; int metasize; u8 nr_frags; if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { pr_debug("Error building skb as missing reserved tailroom for xdp"); return NULL; } if (unlikely(xdp_buff_has_frags(xdp))) nr_frags = sinfo->nr_frags; skb = build_skb(xdp->data_hard_start, xdp->frame_sz); if (unlikely(!skb)) return NULL; headroom = xdp->data - xdp->data_hard_start; data_len = xdp->data_end - xdp->data; skb_reserve(skb, headroom); __skb_put(skb, data_len); metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; if (metasize) skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, xdp_frags_truesz, xdp_buff_get_skb_flags(xdp)); return skb; } /* TODO: build xdp in big mode */ static int virtnet_build_xdp_buff_mrg(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct xdp_buff *xdp, void *buf, unsigned int len, unsigned int frame_sz, int *num_buf, unsigned int *xdp_frags_truesize, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; struct skb_shared_info *shinfo; unsigned int xdp_frags_truesz = 0; unsigned int truesize; struct page *page; skb_frag_t *frag; int offset; void *ctx; xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); xdp_prepare_buff(xdp, buf - XDP_PACKET_HEADROOM, XDP_PACKET_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); if (!*num_buf) return 0; if (*num_buf > 1) { /* If we want to build multi-buffer xdp, we need * to specify that the flags of xdp_buff have the * XDP_FLAGS_HAS_FRAG bit. */ if (!xdp_buff_has_frags(xdp)) xdp_buff_set_frags_flag(xdp); shinfo = xdp_get_shared_info_from_buff(xdp); shinfo->nr_frags = 0; shinfo->xdp_frags_size = 0; } if (*num_buf > MAX_SKB_FRAGS + 1) return -EINVAL; while (--*num_buf > 0) { buf = virtnet_rq_get_buf(rq, &len, &ctx); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", dev->name, *num_buf, virtio16_to_cpu(vi->vdev, hdr->num_buffers)); DEV_STATS_INC(dev, rx_length_errors); goto err; } u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); offset = buf - page_address(page); if (check_mergeable_len(dev, ctx, len)) { put_page(page); goto err; } truesize = mergeable_ctx_to_truesize(ctx); xdp_frags_truesz += truesize; frag = &shinfo->frags[shinfo->nr_frags++]; skb_frag_fill_page_desc(frag, page, offset, len); if (page_is_pfmemalloc(page)) xdp_buff_set_frag_pfmemalloc(xdp); shinfo->xdp_frags_size += len; } *xdp_frags_truesize = xdp_frags_truesz; return 0; err: put_xdp_frags(xdp); return -EINVAL; } static void *mergeable_xdp_get_buf(struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, void *ctx, unsigned int *frame_sz, int *num_buf, struct page **page, int offset, unsigned int *len, struct virtio_net_hdr_mrg_rxbuf *hdr) { unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); struct page *xdp_page; unsigned int xdp_room; /* Transient failure which in theory could occur if * in-flight packets from before XDP was enabled reach * the receive path after XDP is loaded. */ if (unlikely(hdr->hdr.gso_type)) return NULL; /* Partially checksummed packets must be dropped. */ if (unlikely(hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return NULL; /* Now XDP core assumes frag size is PAGE_SIZE, but buffers * with headroom may add hole in truesize, which * make their length exceed PAGE_SIZE. So we disabled the * hole mechanism for xdp. See add_recvbuf_mergeable(). */ *frame_sz = truesize; if (likely(headroom >= virtnet_get_headroom(vi) && (*num_buf == 1 || xdp_prog->aux->xdp_has_frags))) { return page_address(*page) + offset; } /* This happens when headroom is not enough because * of the buffer was prefilled before XDP is set. * This should only happen for the first several packets. * In fact, vq reset can be used here to help us clean up * the prefilled buffers, but many existing devices do not * support it, and we don't want to bother users who are * using xdp normally. */ if (!xdp_prog->aux->xdp_has_frags) { /* linearize data for XDP */ xdp_page = xdp_linearize_page(vi->dev, rq, num_buf, *page, offset, XDP_PACKET_HEADROOM, len); if (!xdp_page) return NULL; } else { xdp_room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); if (*len + xdp_room > PAGE_SIZE) return NULL; xdp_page = alloc_page(GFP_ATOMIC); if (!xdp_page) return NULL; memcpy(page_address(xdp_page) + XDP_PACKET_HEADROOM, page_address(*page) + offset, *len); } *frame_sz = PAGE_SIZE; put_page(*page); *page = xdp_page; return page_address(*page) + XDP_PACKET_HEADROOM; } static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, struct bpf_prog *xdp_prog, void *buf, void *ctx, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; int num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); struct page *page = virt_to_head_page(buf); int offset = buf - page_address(page); unsigned int xdp_frags_truesz = 0; struct sk_buff *head_skb; unsigned int frame_sz; struct xdp_buff xdp; void *data; u32 act; int err; data = mergeable_xdp_get_buf(vi, rq, xdp_prog, ctx, &frame_sz, &num_buf, &page, offset, &len, hdr); if (unlikely(!data)) goto err_xdp; err = virtnet_build_xdp_buff_mrg(dev, vi, rq, &xdp, data, len, frame_sz, &num_buf, &xdp_frags_truesz, stats); if (unlikely(err)) goto err_xdp; act = virtnet_xdp_handler(xdp_prog, &xdp, dev, xdp_xmit, stats); switch (act) { case XDP_PASS: head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz); if (unlikely(!head_skb)) break; return head_skb; case XDP_TX: case XDP_REDIRECT: return NULL; default: break; } put_xdp_frags(&xdp); err_xdp: put_page(page); mergeable_buf_free(rq, num_buf, dev, stats); u64_stats_inc(&stats->xdp_drops); u64_stats_inc(&stats->drops); return NULL; } static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct sk_buff *curr_skb, struct page *page, void *buf, int len, int truesize) { int num_skb_frags; int offset; num_skb_frags = skb_shinfo(curr_skb)->nr_frags; if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); if (unlikely(!nskb)) return NULL; if (curr_skb == head_skb) skb_shinfo(curr_skb)->frag_list = nskb; else curr_skb->next = nskb; curr_skb = nskb; head_skb->truesize += nskb->truesize; num_skb_frags = 0; } if (curr_skb != head_skb) { head_skb->data_len += len; head_skb->len += len; head_skb->truesize += truesize; } offset = buf - page_address(page); if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { put_page(page); skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, len, truesize); } else { skb_add_rx_frag(curr_skb, num_skb_frags, page, offset, len, truesize); } return curr_skb; } static struct sk_buff *receive_mergeable(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, void *ctx, unsigned int len, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; int num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); struct page *page = virt_to_head_page(buf); int offset = buf - page_address(page); struct sk_buff *head_skb, *curr_skb; unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); head_skb = NULL; u64_stats_add(&stats->bytes, len - vi->hdr_len); if (check_mergeable_len(dev, ctx, len)) goto err_skb; if (unlikely(vi->xdp_enabled)) { struct bpf_prog *xdp_prog; rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (xdp_prog) { head_skb = receive_mergeable_xdp(dev, vi, rq, xdp_prog, buf, ctx, len, xdp_xmit, stats); rcu_read_unlock(); return head_skb; } rcu_read_unlock(); } head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom); curr_skb = head_skb; if (unlikely(!curr_skb)) goto err_skb; while (--num_buf) { buf = virtnet_rq_get_buf(rq, &len, &ctx); if (unlikely(!buf)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", dev->name, num_buf, virtio16_to_cpu(vi->vdev, hdr->num_buffers)); DEV_STATS_INC(dev, rx_length_errors); goto err_buf; } u64_stats_add(&stats->bytes, len); page = virt_to_head_page(buf); if (check_mergeable_len(dev, ctx, len)) goto err_skb; truesize = mergeable_ctx_to_truesize(ctx); curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, buf, len, truesize); if (!curr_skb) goto err_skb; } ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len); return head_skb; err_skb: put_page(page); mergeable_buf_free(rq, num_buf, dev, stats); err_buf: u64_stats_inc(&stats->drops); dev_kfree_skb(head_skb); return NULL; } static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, struct sk_buff *skb) { enum pkt_hash_types rss_hash_type; if (!hdr_hash || !skb) return; switch (__le16_to_cpu(hdr_hash->hash_report)) { case VIRTIO_NET_HASH_REPORT_TCPv4: case VIRTIO_NET_HASH_REPORT_UDPv4: case VIRTIO_NET_HASH_REPORT_TCPv6: case VIRTIO_NET_HASH_REPORT_UDPv6: case VIRTIO_NET_HASH_REPORT_TCPv6_EX: case VIRTIO_NET_HASH_REPORT_UDPv6_EX: rss_hash_type = PKT_HASH_TYPE_L4; break; case VIRTIO_NET_HASH_REPORT_IPv4: case VIRTIO_NET_HASH_REPORT_IPv6: case VIRTIO_NET_HASH_REPORT_IPv6_EX: rss_hash_type = PKT_HASH_TYPE_L3; break; case VIRTIO_NET_HASH_REPORT_NONE: default: rss_hash_type = PKT_HASH_TYPE_NONE; } skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); } static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, struct sk_buff *skb, u8 flags) { struct virtio_net_common_hdr *hdr; struct net_device *dev = vi->dev; hdr = skb_vnet_common_hdr(skb); if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); hdr->hdr.flags = flags; if (virtio_net_handle_csum_offload(skb, &hdr->hdr, vi->rx_tnl_csum)) { net_warn_ratelimited("%s: bad csum: flags: %x, gso_type: %x rx_tnl_csum %d\n", dev->name, hdr->hdr.flags, hdr->hdr.gso_type, vi->rx_tnl_csum); goto frame_err; } if (virtio_net_hdr_tnl_to_skb(skb, &hdr->tnl_hdr, vi->rx_tnl, vi->rx_tnl_csum, virtio_is_little_endian(vi->vdev))) { net_warn_ratelimited("%s: bad gso: type: %x, size: %u, flags %x tunnel %d tnl csum %d\n", dev->name, hdr->hdr.gso_type, hdr->hdr.gso_size, hdr->hdr.flags, vi->rx_tnl, vi->rx_tnl_csum); goto frame_err; } skb_record_rx_queue(skb, vq2rxq(rq->vq)); skb->protocol = eth_type_trans(skb, dev); pr_debug("Receiving skb proto 0x%04x len %i type %i\n", ntohs(skb->protocol), skb->len, skb->pkt_type); napi_gro_receive(&rq->napi, skb); return; frame_err: DEV_STATS_INC(dev, rx_frame_errors); dev_kfree_skb(skb); } static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, void *buf, unsigned int len, void **ctx, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { struct net_device *dev = vi->dev; struct sk_buff *skb; u8 flags; if (unlikely(len < vi->hdr_len + ETH_HLEN)) { pr_debug("%s: short packet %i\n", dev->name, len); DEV_STATS_INC(dev, rx_length_errors); virtnet_rq_free_buf(vi, rq, buf); return; } /* 1. Save the flags early, as the XDP program might overwrite them. * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID * stay valid after XDP processing. * 2. XDP doesn't work with partially checksummed packets (refer to * virtnet_xdp_set()), so packets marked as * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. */ flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; if (vi->mergeable_rx_bufs) skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); else if (vi->big_packets) skb = receive_big(dev, vi, rq, buf, len, stats); else skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); if (unlikely(!skb)) return; virtnet_receive_done(vi, rq, skb, flags); } /* Unlike mergeable buffers, all buffers are allocated to the * same size, except for the headroom. For this reason we do * not need to use mergeable_len_to_ctx here - it is enough * to store the headroom as the context ignoring the truesize. */ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { char *buf; unsigned int xdp_headroom = virtnet_get_headroom(vi); void *ctx = (void *)(unsigned long)xdp_headroom; int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom; int err; len = SKB_DATA_ALIGN(len) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (unlikely(!skb_page_frag_refill(len, &rq->alloc_frag, gfp))) return -ENOMEM; buf = virtnet_rq_alloc(rq, len, gfp); if (unlikely(!buf)) return -ENOMEM; buf += VIRTNET_RX_PAD + xdp_headroom; virtnet_rq_init_one_sg(rq, buf, vi->hdr_len + GOOD_PACKET_LEN); err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } return err; } static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { struct page *first, *list = NULL; char *p; int i, err, offset; sg_init_table(rq->sg, vi->big_packets_num_skbfrags + 2); /* page in rq->sg[vi->big_packets_num_skbfrags + 1] is list tail */ for (i = vi->big_packets_num_skbfrags + 1; i > 1; --i) { first = get_a_page(rq, gfp); if (!first) { if (list) give_pages(rq, list); return -ENOMEM; } sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE); /* chain new page in list head to match sg */ first->private = (unsigned long)list; list = first; } first = get_a_page(rq, gfp); if (!first) { give_pages(rq, list); return -ENOMEM; } p = page_address(first); /* rq->sg[0], rq->sg[1] share the same page */ /* a separated rq->sg[0] for header - required in case !any_header_sg */ sg_set_buf(&rq->sg[0], p, vi->hdr_len); /* rq->sg[1] for data packet, from offset */ offset = sizeof(struct padded_vnet_hdr); sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset); /* chain first in list head */ first->private = (unsigned long)list; err = virtqueue_add_inbuf(rq->vq, rq->sg, vi->big_packets_num_skbfrags + 2, first, gfp); if (err < 0) give_pages(rq, first); return err; } static unsigned int get_mergeable_buf_len(struct receive_queue *rq, struct ewma_pkt_len *avg_pkt_len, unsigned int room) { struct virtnet_info *vi = rq->vq->vdev->priv; const size_t hdr_len = vi->hdr_len; unsigned int len; if (room) return PAGE_SIZE - room; len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len), rq->min_buf_len, PAGE_SIZE - hdr_len); return ALIGN(len, L1_CACHE_BYTES); } static int add_recvbuf_mergeable(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { struct page_frag *alloc_frag = &rq->alloc_frag; unsigned int headroom = virtnet_get_headroom(vi); unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; unsigned int room = SKB_DATA_ALIGN(headroom + tailroom); unsigned int len, hole; void *ctx; char *buf; int err; /* Extra tailroom is needed to satisfy XDP's assumption. This * means rx frags coalescing won't work, but consider we've * disabled GSO for XDP, it won't be a big issue. */ len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room); if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp))) return -ENOMEM; if (!alloc_frag->offset && len + room + sizeof(struct virtnet_rq_dma) > alloc_frag->size) len -= sizeof(struct virtnet_rq_dma); buf = virtnet_rq_alloc(rq, len + room, gfp); if (unlikely(!buf)) return -ENOMEM; buf += headroom; /* advance address leaving hole at front of pkt */ hole = alloc_frag->size - alloc_frag->offset; if (hole < len + room) { /* To avoid internal fragmentation, if there is very likely not * enough space for another buffer, add the remaining space to * the current buffer. * XDP core assumes that frame_size of xdp_buff and the length * of the frag are PAGE_SIZE, so we disable the hole mechanism. */ if (!headroom) len += hole; alloc_frag->offset += hole; } virtnet_rq_init_one_sg(rq, buf, len); ctx = mergeable_len_to_ctx(len + room, headroom); err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); } return err; } /* * Returns false if we couldn't fill entirely (OOM). * * Normally run in the receive path, but can also be run from ndo_open * before we're receiving packets, or from refill_work which is * careful to disable receiving (using napi_disable). */ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, gfp_t gfp) { int err; if (rq->xsk_pool) { err = virtnet_add_recvbuf_xsk(vi, rq, rq->xsk_pool, gfp); goto kick; } do { if (vi->mergeable_rx_bufs) err = add_recvbuf_mergeable(vi, rq, gfp); else if (vi->big_packets) err = add_recvbuf_big(vi, rq, gfp); else err = add_recvbuf_small(vi, rq, gfp); if (err) break; } while (rq->vq->num_free); kick: if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) { unsigned long flags; flags = u64_stats_update_begin_irqsave(&rq->stats.syncp); u64_stats_inc(&rq->stats.kicks); u64_stats_update_end_irqrestore(&rq->stats.syncp, flags); } return err != -ENOMEM; } static void skb_recv_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; struct receive_queue *rq = &vi->rq[vq2rxq(rvq)]; rq->calls++; virtqueue_napi_schedule(&rq->napi, rvq); } static void virtnet_napi_do_enable(struct virtqueue *vq, struct napi_struct *napi) { napi_enable(napi); /* If all buffers were filled by other side before we napi_enabled, we * won't get another interrupt, so process any outstanding packets now. * Call local_bh_enable after to trigger softIRQ processing. */ local_bh_disable(); virtqueue_napi_schedule(napi, vq); local_bh_enable(); } static void virtnet_napi_enable(struct receive_queue *rq) { struct virtnet_info *vi = rq->vq->vdev->priv; int qidx = vq2rxq(rq->vq); virtnet_napi_do_enable(rq->vq, &rq->napi); netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi); } static void virtnet_napi_tx_enable(struct send_queue *sq) { struct virtnet_info *vi = sq->vq->vdev->priv; struct napi_struct *napi = &sq->napi; int qidx = vq2txq(sq->vq); if (!napi->weight) return; /* Tx napi touches cachelines on the cpu handling tx interrupts. Only * enable the feature if this is likely affine with the transmit path. */ if (!vi->affinity_hint_set) { napi->weight = 0; return; } virtnet_napi_do_enable(sq->vq, napi); netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi); } static void virtnet_napi_tx_disable(struct send_queue *sq) { struct virtnet_info *vi = sq->vq->vdev->priv; struct napi_struct *napi = &sq->napi; int qidx = vq2txq(sq->vq); if (napi->weight) { netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL); napi_disable(napi); } } static void virtnet_napi_disable(struct receive_queue *rq) { struct virtnet_info *vi = rq->vq->vdev->priv; struct napi_struct *napi = &rq->napi; int qidx = vq2rxq(rq->vq); netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL); napi_disable(napi); } static void refill_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, refill.work); bool still_empty; int i; for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; /* * When queue API support is added in the future and the call * below becomes napi_disable_locked, this driver will need to * be refactored. * * One possible solution would be to: * - cancel refill_work with cancel_delayed_work (note: * non-sync) * - cancel refill_work with cancel_delayed_work_sync in * virtnet_remove after the netdev is unregistered * - wrap all of the work in a lock (perhaps the netdev * instance lock) * - check netif_running() and return early to avoid a race */ napi_disable(&rq->napi); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); virtnet_napi_do_enable(rq->vq, &rq->napi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. */ if (still_empty) schedule_delayed_work(&vi->refill, HZ/2); } } static int virtnet_receive_xsk_bufs(struct virtnet_info *vi, struct receive_queue *rq, int budget, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int len; int packets = 0; void *buf; while (packets < budget) { buf = virtqueue_get_buf(rq->vq, &len); if (!buf) break; virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, stats); packets++; } return packets; } static int virtnet_receive_packets(struct virtnet_info *vi, struct receive_queue *rq, int budget, unsigned int *xdp_xmit, struct virtnet_rq_stats *stats) { unsigned int len; int packets = 0; void *buf; if (!vi->big_packets || vi->mergeable_rx_bufs) { void *ctx; while (packets < budget && (buf = virtnet_rq_get_buf(rq, &len, &ctx))) { receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); packets++; } } else { while (packets < budget && (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats); packets++; } } return packets; } static int virtnet_receive(struct receive_queue *rq, int budget, unsigned int *xdp_xmit) { struct virtnet_info *vi = rq->vq->vdev->priv; struct virtnet_rq_stats stats = {}; int i, packets; if (rq->xsk_pool) packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats); else packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats); if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { spin_lock(&vi->refill_lock); if (vi->refill_enabled) schedule_delayed_work(&vi->refill, 0); spin_unlock(&vi->refill_lock); } } u64_stats_set(&stats.packets, packets); u64_stats_update_begin(&rq->stats.syncp); for (i = 0; i < ARRAY_SIZE(virtnet_rq_stats_desc); i++) { size_t offset = virtnet_rq_stats_desc[i].offset; u64_stats_t *item, *src; item = (u64_stats_t *)((u8 *)&rq->stats + offset); src = (u64_stats_t *)((u8 *)&stats + offset); u64_stats_add(item, u64_stats_read(src)); } u64_stats_add(&rq->stats.packets, u64_stats_read(&stats.packets)); u64_stats_add(&rq->stats.bytes, u64_stats_read(&stats.bytes)); u64_stats_update_end(&rq->stats.syncp); return packets; } static void virtnet_poll_cleantx(struct receive_queue *rq, int budget) { struct virtnet_info *vi = rq->vq->vdev->priv; unsigned int index = vq2rxq(rq->vq); struct send_queue *sq = &vi->sq[index]; struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index); if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index)) return; if (__netif_tx_trylock(txq)) { if (sq->reset) { __netif_tx_unlock(txq); return; } do { virtqueue_disable_cb(sq->vq); free_old_xmit(sq, txq, !!budget); } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); if (sq->vq->num_free >= MAX_SKB_FRAGS + 2 && netif_tx_queue_stopped(txq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); u64_stats_update_end(&sq->stats.syncp); netif_tx_wake_queue(txq); } __netif_tx_unlock(txq); } } static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue *rq) { struct dim_sample cur_sample = {}; if (!rq->packets_in_napi) return; /* Don't need protection when fetching stats, since fetcher and * updater of the stats are in same context */ dim_update_sample(rq->calls, u64_stats_read(&rq->stats.packets), u64_stats_read(&rq->stats.bytes), &cur_sample); net_dim(&rq->dim, &cur_sample); rq->packets_in_napi = 0; } static int virtnet_poll(struct napi_struct *napi, int budget) { struct receive_queue *rq = container_of(napi, struct receive_queue, napi); struct virtnet_info *vi = rq->vq->vdev->priv; struct send_queue *sq; unsigned int received; unsigned int xdp_xmit = 0; bool napi_complete; virtnet_poll_cleantx(rq, budget); received = virtnet_receive(rq, budget, &xdp_xmit); rq->packets_in_napi += received; if (xdp_xmit & VIRTIO_XDP_REDIR) xdp_do_flush(); /* Out of packets? */ if (received < budget) { napi_complete = virtqueue_napi_complete(napi, rq->vq, received); /* Intentionally not taking dim_lock here. This may result in a * spurious net_dim call. But if that happens virtnet_rx_dim_work * will not act on the scheduled work. */ if (napi_complete && rq->dim_enabled) virtnet_rx_dim_update(vi, rq); } if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_get_sq(vi); if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks); u64_stats_update_end(&sq->stats.syncp); } virtnet_xdp_put_sq(vi, sq); } return received; } static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index) { virtnet_napi_tx_disable(&vi->sq[qp_index]); virtnet_napi_disable(&vi->rq[qp_index]); xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); } static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) { struct net_device *dev = vi->dev; int err; err = xdp_rxq_info_reg(&vi->rq[qp_index].xdp_rxq, dev, qp_index, vi->rq[qp_index].napi.napi_id); if (err < 0) return err; err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) goto err_xdp_reg_mem_model; virtnet_napi_enable(&vi->rq[qp_index]); virtnet_napi_tx_enable(&vi->sq[qp_index]); return 0; err_xdp_reg_mem_model: xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); return err; } static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim) { if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return; net_dim_work_cancel(dim); } static void virtnet_update_settings(struct virtnet_info *vi) { u32 speed; u8 duplex; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX)) return; virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed); if (ethtool_validate_speed(speed)) vi->speed = speed; virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex); if (ethtool_validate_duplex(duplex)) vi->duplex = duplex; } static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int i, err; enable_delayed_refill(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (i < vi->curr_queue_pairs) /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); err = virtnet_enable_queue_pair(vi, i); if (err < 0) goto err_enable_qp; } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { if (vi->status & VIRTIO_NET_S_LINK_UP) netif_carrier_on(vi->dev); virtio_config_driver_enable(vi->vdev); } else { vi->status = VIRTIO_NET_S_LINK_UP; netif_carrier_on(dev); } return 0; err_enable_qp: disable_delayed_refill(vi); cancel_delayed_work_sync(&vi->refill); for (i--; i >= 0; i--) { virtnet_disable_queue_pair(vi, i); virtnet_cancel_dim(vi, &vi->rq[i].dim); } return err; } static int virtnet_poll_tx(struct napi_struct *napi, int budget) { struct send_queue *sq = container_of(napi, struct send_queue, napi); struct virtnet_info *vi = sq->vq->vdev->priv; unsigned int index = vq2txq(sq->vq); struct netdev_queue *txq; int opaque, xsk_done = 0; bool done; if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { /* We don't need to enable cb for XDP */ napi_complete_done(napi, 0); return 0; } txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); if (sq->xsk_pool) xsk_done = virtnet_xsk_xmit(sq, sq->xsk_pool, budget); else free_old_xmit(sq, txq, !!budget); if (sq->vq->num_free >= MAX_SKB_FRAGS + 2 && netif_tx_queue_stopped(txq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.wake); u64_stats_update_end(&sq->stats.syncp); netif_tx_wake_queue(txq); } if (xsk_done >= budget) { __netif_tx_unlock(txq); return budget; } opaque = virtqueue_enable_cb_prepare(sq->vq); done = napi_complete_done(napi, 0); if (!done) virtqueue_disable_cb(sq->vq); __netif_tx_unlock(txq); if (done) { if (unlikely(virtqueue_poll(sq->vq, opaque))) { if (napi_schedule_prep(napi)) { __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); __netif_tx_unlock(txq); __napi_schedule(napi); } } } return 0; } static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) { const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; struct virtnet_info *vi = sq->vq->vdev->priv; struct virtio_net_hdr_v1_hash_tunnel *hdr; int num_sg; unsigned hdr_len = vi->hdr_len; bool can_push; pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); can_push = vi->any_header_sg && !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; /* Even if we can, don't push here yet as this would skew * csum_start offset below. */ if (can_push) hdr = (struct virtio_net_hdr_v1_hash_tunnel *)(skb->data - hdr_len); else hdr = &skb_vnet_common_hdr(skb)->tnl_hdr; if (virtio_net_hdr_tnl_from_skb(skb, hdr, vi->tx_tnl, virtio_is_little_endian(vi->vdev), 0)) return -EPROTO; if (vi->mergeable_rx_bufs) hdr->hash_hdr.hdr.num_buffers = 0; sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2)); if (can_push) { __skb_push(skb, hdr_len); num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len); if (unlikely(num_sg < 0)) return num_sg; /* Pull header back to avoid skew in tx bytes calculations. */ __skb_pull(skb, hdr_len); } else { sg_set_buf(sq->sg, hdr, hdr_len); num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len); if (unlikely(num_sg < 0)) return num_sg; num_sg++; } return virtnet_add_outbuf(sq, num_sg, skb, orphan ? VIRTNET_XMIT_TYPE_SKB_ORPHAN : VIRTNET_XMIT_TYPE_SKB); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int qnum = skb_get_queue_mapping(skb); struct send_queue *sq = &vi->sq[qnum]; int err; struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); bool xmit_more = netdev_xmit_more(); bool use_napi = sq->napi.weight; bool kick; if (!use_napi) free_old_xmit(sq, txq, false); else virtqueue_disable_cb(sq->vq); /* timestamp packet in software */ skb_tx_timestamp(skb); /* Try to transmit */ err = xmit_skb(sq, skb, !use_napi); /* This should not happen! */ if (unlikely(err)) { DEV_STATS_INC(dev, tx_fifo_errors); if (net_ratelimit()) dev_warn(&dev->dev, "Unexpected TXQ (%d) queue failure: %d\n", qnum, err); DEV_STATS_INC(dev, tx_dropped); dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* Don't wait up for transmitted skbs to be freed. */ if (!use_napi) { skb_orphan(skb); nf_reset_ct(skb); } if (use_napi) tx_may_stop(vi, dev, sq); else check_sq_full_and_disable(vi, dev,sq); kick = use_napi ? __netdev_tx_sent_queue(txq, skb->len, xmit_more) : !xmit_more || netif_xmit_stopped(txq); if (kick) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks); u64_stats_update_end(&sq->stats.syncp); } } if (use_napi && kick && unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); return NETDEV_TX_OK; } static void __virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) { bool running = netif_running(vi->dev); if (running) { virtnet_napi_disable(rq); virtnet_cancel_dim(vi, &rq->dim); } } static void virtnet_rx_pause_all(struct virtnet_info *vi) { int i; /* * Make sure refill_work does not run concurrently to * avoid napi_disable race which leads to deadlock. */ disable_delayed_refill(vi); cancel_delayed_work_sync(&vi->refill); for (i = 0; i < vi->max_queue_pairs; i++) __virtnet_rx_pause(vi, &vi->rq[i]); } static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) { /* * Make sure refill_work does not run concurrently to * avoid napi_disable race which leads to deadlock. */ disable_delayed_refill(vi); cancel_delayed_work_sync(&vi->refill); __virtnet_rx_pause(vi, rq); } static void __virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq, bool refill) { bool running = netif_running(vi->dev); bool schedule_refill = false; if (refill && !try_fill_recv(vi, rq, GFP_KERNEL)) schedule_refill = true; if (running) virtnet_napi_enable(rq); if (schedule_refill) schedule_delayed_work(&vi->refill, 0); } static void virtnet_rx_resume_all(struct virtnet_info *vi) { int i; enable_delayed_refill(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (i < vi->curr_queue_pairs) __virtnet_rx_resume(vi, &vi->rq[i], true); else __virtnet_rx_resume(vi, &vi->rq[i], false); } } static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) { enable_delayed_refill(vi); __virtnet_rx_resume(vi, rq, true); } static int virtnet_rx_resize(struct virtnet_info *vi, struct receive_queue *rq, u32 ring_num) { int err, qindex; qindex = rq - vi->rq; virtnet_rx_pause(vi, rq); err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf, NULL); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); virtnet_rx_resume(vi, rq); return err; } static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) { bool running = netif_running(vi->dev); struct netdev_queue *txq; int qindex; qindex = sq - vi->sq; if (running) virtnet_napi_tx_disable(sq); txq = netdev_get_tx_queue(vi->dev, qindex); /* 1. wait all ximt complete * 2. fix the race of netif_stop_subqueue() vs netif_start_subqueue() */ __netif_tx_lock_bh(txq); /* Prevent rx poll from accessing sq. */ sq->reset = true; /* Prevent the upper layer from trying to send packets. */ netif_stop_subqueue(vi->dev, qindex); __netif_tx_unlock_bh(txq); } static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq) { bool running = netif_running(vi->dev); struct netdev_queue *txq; int qindex; qindex = sq - vi->sq; txq = netdev_get_tx_queue(vi->dev, qindex); __netif_tx_lock_bh(txq); sq->reset = false; netif_tx_wake_queue(txq); __netif_tx_unlock_bh(txq); if (running) virtnet_napi_tx_enable(sq); } static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, u32 ring_num) { int qindex, err; if (ring_num <= MAX_SKB_FRAGS + 2) { netdev_err(vi->dev, "tx size (%d) cannot be smaller than %d\n", ring_num, MAX_SKB_FRAGS + 2); return -EINVAL; } qindex = sq - vi->sq; virtnet_tx_pause(vi, sq); err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, virtnet_sq_free_unused_buf_done); if (err) netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); virtnet_tx_resume(vi, sq); return err; } /* * Send command via the control virtqueue and check status. Commands * supported by the hypervisor, as indicated by feature bits, should * never fail unless improperly formatted. */ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd, struct scatterlist *out, struct scatterlist *in) { struct scatterlist *sgs[5], hdr, stat; u32 out_num = 0, tmp, in_num = 0; bool ok; int ret; /* Caller should know better */ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); mutex_lock(&vi->cvq_lock); vi->ctrl->status = ~0; vi->ctrl->hdr.class = class; vi->ctrl->hdr.cmd = cmd; /* Add header */ sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr)); sgs[out_num++] = &hdr; if (out) sgs[out_num++] = out; /* Add return status. */ sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status)); sgs[out_num + in_num++] = &stat; if (in) sgs[out_num + in_num++] = in; BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC); if (ret < 0) { dev_warn(&vi->vdev->dev, "Failed to add sgs for command vq: %d\n.", ret); mutex_unlock(&vi->cvq_lock); return false; } if (unlikely(!virtqueue_kick(vi->cvq))) goto unlock; /* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately. */ while (!virtqueue_get_buf(vi->cvq, &tmp) && !virtqueue_is_broken(vi->cvq)) { cond_resched(); cpu_relax(); } unlock: ok = vi->ctrl->status == VIRTIO_NET_OK; mutex_unlock(&vi->cvq_lock); return ok; } static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, struct scatterlist *out) { return virtnet_send_command_reply(vi, class, cmd, out, NULL); } static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi->vdev; int ret; struct sockaddr *addr; struct scatterlist sg; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY)) return -EOPNOTSUPP; addr = kmemdup(p, sizeof(*addr), GFP_KERNEL); if (!addr) return -ENOMEM; ret = eth_prepare_mac_addr_change(dev, addr); if (ret) goto out; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { sg_init_one(&sg, addr->sa_data, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) { dev_warn(&vdev->dev, "Failed to set mac address by vq command.\n"); ret = -EINVAL; goto out; } } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) && !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { unsigned int i; /* Naturally, this has an atomicity problem. */ for (i = 0; i < dev->addr_len; i++) virtio_cwrite8(vdev, offsetof(struct virtio_net_config, mac) + i, addr->sa_data[i]); } eth_commit_mac_addr_change(dev, p); ret = 0; out: kfree(addr); return ret; } static void virtnet_stats(struct net_device *dev, struct rtnl_link_stats64 *tot) { struct virtnet_info *vi = netdev_priv(dev); unsigned int start; int i; for (i = 0; i < vi->max_queue_pairs; i++) { u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops; struct receive_queue *rq = &vi->rq[i]; struct send_queue *sq = &vi->sq[i]; do { start = u64_stats_fetch_begin(&sq->stats.syncp); tpackets = u64_stats_read(&sq->stats.packets); tbytes = u64_stats_read(&sq->stats.bytes); terrors = u64_stats_read(&sq->stats.tx_timeouts); } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); do { start = u64_stats_fetch_begin(&rq->stats.syncp); rpackets = u64_stats_read(&rq->stats.packets); rbytes = u64_stats_read(&rq->stats.bytes); rdrops = u64_stats_read(&rq->stats.drops); } while (u64_stats_fetch_retry(&rq->stats.syncp, start)); tot->rx_packets += rpackets; tot->tx_packets += tpackets; tot->rx_bytes += rbytes; tot->tx_bytes += tbytes; tot->rx_dropped += rdrops; tot->tx_errors += terrors; } tot->tx_dropped = DEV_STATS_READ(dev, tx_dropped); tot->tx_fifo_errors = DEV_STATS_READ(dev, tx_fifo_errors); tot->rx_length_errors = DEV_STATS_READ(dev, rx_length_errors); tot->rx_frame_errors = DEV_STATS_READ(dev, rx_frame_errors); } static void virtnet_ack_link_announce(struct virtnet_info *vi) { if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL)) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); } static bool virtnet_commit_rss_command(struct virtnet_info *vi); static void virtnet_rss_update_by_qpairs(struct virtnet_info *vi, u16 queue_pairs) { u32 indir_val = 0; int i = 0; for (; i < vi->rss_indir_table_size; ++i) { indir_val = ethtool_rxfh_indir_default(i, queue_pairs); vi->rss_hdr->indirection_table[i] = cpu_to_le16(indir_val); } vi->rss_trailer.max_tx_vq = cpu_to_le16(queue_pairs); } static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) { struct virtio_net_ctrl_mq *mq __free(kfree) = NULL; struct virtio_net_rss_config_hdr *old_rss_hdr; struct virtio_net_rss_config_trailer old_rss_trailer; struct net_device *dev = vi->dev; struct scatterlist sg; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; /* Firstly check if we need update rss. Do updating if both (1) rss enabled and * (2) no user configuration. * * During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, * the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs * update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. */ if (vi->has_rss && !netif_is_rxfh_configured(dev)) { old_rss_hdr = vi->rss_hdr; old_rss_trailer = vi->rss_trailer; vi->rss_hdr = devm_kzalloc(&dev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); if (!vi->rss_hdr) { vi->rss_hdr = old_rss_hdr; return -ENOMEM; } *vi->rss_hdr = *old_rss_hdr; virtnet_rss_update_by_qpairs(vi, queue_pairs); if (!virtnet_commit_rss_command(vi)) { /* restore ctrl_rss if commit_rss_command failed */ devm_kfree(&dev->dev, vi->rss_hdr); vi->rss_hdr = old_rss_hdr; vi->rss_trailer = old_rss_trailer; dev_warn(&dev->dev, "Fail to set num of queue pairs to %d, because committing RSS failed\n", queue_pairs); return -EINVAL; } devm_kfree(&dev->dev, old_rss_hdr); goto succ; } mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) return -ENOMEM; mq->virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs); sg_init_one(&sg, mq, sizeof(*mq)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) { dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; } succ: vi->curr_queue_pairs = queue_pairs; /* virtnet_open() will refill when device is going to up. */ spin_lock_bh(&vi->refill_lock); if (dev->flags & IFF_UP && vi->refill_enabled) schedule_delayed_work(&vi->refill, 0); spin_unlock_bh(&vi->refill_lock); return 0; } static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int i; /* Make sure NAPI doesn't schedule refill work */ disable_delayed_refill(vi); /* Make sure refill_work doesn't re-enable napi! */ cancel_delayed_work_sync(&vi->refill); /* Prevent the config change callback from changing carrier * after close */ virtio_config_driver_disable(vi->vdev); /* Stop getting status/speed updates: we don't care until next * open */ cancel_work_sync(&vi->config_work); for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_disable_queue_pair(vi, i); virtnet_cancel_dim(vi, &vi->rq[i].dim); } netif_carrier_off(dev); return 0; } static void virtnet_rx_mode_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, rx_mode_work); u8 *promisc_allmulti __free(kfree) = NULL; struct net_device *dev = vi->dev; struct scatterlist sg[2]; struct virtio_net_ctrl_mac *mac_data; struct netdev_hw_addr *ha; int uc_count; int mc_count; void *buf; int i; /* We can't dynamically set ndo_set_rx_mode, so return gracefully */ if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; promisc_allmulti = kzalloc(sizeof(*promisc_allmulti), GFP_KERNEL); if (!promisc_allmulti) { dev_warn(&dev->dev, "Failed to set RX mode, no memory.\n"); return; } rtnl_lock(); *promisc_allmulti = !!(dev->flags & IFF_PROMISC); sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, sg)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", *promisc_allmulti ? "en" : "dis"); *promisc_allmulti = !!(dev->flags & IFF_ALLMULTI); sg_init_one(sg, promisc_allmulti, sizeof(*promisc_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, sg)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", *promisc_allmulti ? "en" : "dis"); netif_addr_lock_bh(dev); uc_count = netdev_uc_count(dev); mc_count = netdev_mc_count(dev); /* MAC filter - use one buffer for both lists */ buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); mac_data = buf; if (!buf) { netif_addr_unlock_bh(dev); rtnl_unlock(); return; } sg_init_table(sg, 2); /* Store the unicast list and count in the front of the buffer */ mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count); i = 0; netdev_for_each_uc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); sg_set_buf(&sg[0], mac_data, sizeof(mac_data->entries) + (uc_count * ETH_ALEN)); /* multicast list and count fill the end */ mac_data = (void *)&mac_data->macs[uc_count][0]; mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count); i = 0; netdev_for_each_mc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); netif_addr_unlock_bh(dev); sg_set_buf(&sg[1], mac_data, sizeof(mac_data->entries) + (mc_count * ETH_ALEN)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_TABLE_SET, sg)) dev_warn(&dev->dev, "Failed to set MAC filter table.\n"); rtnl_unlock(); kfree(buf); } static void virtnet_set_rx_mode(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); if (vi->rx_mode_work_enabled) schedule_work(&vi->rx_mode_work); } static int virtnet_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct virtnet_info *vi = netdev_priv(dev); __virtio16 *_vid __free(kfree) = NULL; struct scatterlist sg; _vid = kzalloc(sizeof(*_vid), GFP_KERNEL); if (!_vid) return -ENOMEM; *_vid = cpu_to_virtio16(vi->vdev, vid); sg_init_one(&sg, _vid, sizeof(*_vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_ADD, &sg)) dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); return 0; } static int virtnet_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct virtnet_info *vi = netdev_priv(dev); __virtio16 *_vid __free(kfree) = NULL; struct scatterlist sg; _vid = kzalloc(sizeof(*_vid), GFP_KERNEL); if (!_vid) return -ENOMEM; *_vid = cpu_to_virtio16(vi->vdev, vid); sg_init_one(&sg, _vid, sizeof(*_vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, VIRTIO_NET_CTRL_VLAN_DEL, &sg)) dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); return 0; } static void virtnet_clean_affinity(struct virtnet_info *vi) { int i; if (vi->affinity_hint_set) { for (i = 0; i < vi->max_queue_pairs; i++) { virtqueue_set_affinity(vi->rq[i].vq, NULL); virtqueue_set_affinity(vi->sq[i].vq, NULL); } vi->affinity_hint_set = false; } } static void virtnet_set_affinity(struct virtnet_info *vi) { cpumask_var_t mask; int stragglers; int group_size; int i, start = 0, cpu; int num_cpu; int stride; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { virtnet_clean_affinity(vi); return; } num_cpu = num_online_cpus(); stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1); stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); for_each_online_cpu_wrap(cpu, start) { if (!group_size--) { start = cpu; break; } cpumask_set_cpu(cpu, mask); } virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); cpumask_clear(mask); } vi->affinity_hint_set = true; free_cpumask_var(mask); } static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node) { struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, node); virtnet_set_affinity(vi); return 0; } static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, node_dead); virtnet_set_affinity(vi); return 0; } static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node) { struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info, node); virtnet_clean_affinity(vi); return 0; } static enum cpuhp_state virtionet_online; static int virtnet_cpu_notif_add(struct virtnet_info *vi) { int ret; ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node); if (ret) return ret; ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD, &vi->node_dead); if (!ret) return ret; cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node); return ret; } static void virtnet_cpu_notif_remove(struct virtnet_info *vi) { cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node); cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD, &vi->node_dead); } static int virtnet_send_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 vqn, u32 max_usecs, u32 max_packets) { struct virtio_net_ctrl_coal_vq *coal_vq __free(kfree) = NULL; struct scatterlist sgs; coal_vq = kzalloc(sizeof(*coal_vq), GFP_KERNEL); if (!coal_vq) return -ENOMEM; coal_vq->vqn = cpu_to_le16(vqn); coal_vq->coal.max_usecs = cpu_to_le32(max_usecs); coal_vq->coal.max_packets = cpu_to_le32(max_packets); sg_init_one(&sgs, coal_vq, sizeof(*coal_vq)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_VQ_SET, &sgs)) return -EINVAL; return 0; } static int virtnet_send_rx_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 queue, u32 max_usecs, u32 max_packets) { int err; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return -EOPNOTSUPP; err = virtnet_send_ctrl_coal_vq_cmd(vi, rxq2vq(queue), max_usecs, max_packets); if (err) return err; vi->rq[queue].intr_coal.max_usecs = max_usecs; vi->rq[queue].intr_coal.max_packets = max_packets; return 0; } static int virtnet_send_tx_ctrl_coal_vq_cmd(struct virtnet_info *vi, u16 queue, u32 max_usecs, u32 max_packets) { int err; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return -EOPNOTSUPP; err = virtnet_send_ctrl_coal_vq_cmd(vi, txq2vq(queue), max_usecs, max_packets); if (err) return err; vi->sq[queue].intr_coal.max_usecs = max_usecs; vi->sq[queue].intr_coal.max_packets = max_packets; return 0; } static void virtnet_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring, struct kernel_ethtool_ringparam *kernel_ring, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); ring->rx_max_pending = vi->rq[0].vq->num_max; ring->tx_max_pending = vi->sq[0].vq->num_max; ring->rx_pending = virtqueue_get_vring_size(vi->rq[0].vq); ring->tx_pending = virtqueue_get_vring_size(vi->sq[0].vq); } static int virtnet_set_ringparam(struct net_device *dev, struct ethtool_ringparam *ring, struct kernel_ethtool_ringparam *kernel_ring, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); u32 rx_pending, tx_pending; struct receive_queue *rq; struct send_queue *sq; int i, err; if (ring->rx_mini_pending || ring->rx_jumbo_pending) return -EINVAL; rx_pending = virtqueue_get_vring_size(vi->rq[0].vq); tx_pending = virtqueue_get_vring_size(vi->sq[0].vq); if (ring->rx_pending == rx_pending && ring->tx_pending == tx_pending) return 0; if (ring->rx_pending > vi->rq[0].vq->num_max) return -EINVAL; if (ring->tx_pending > vi->sq[0].vq->num_max) return -EINVAL; for (i = 0; i < vi->max_queue_pairs; i++) { rq = vi->rq + i; sq = vi->sq + i; if (ring->tx_pending != tx_pending) { err = virtnet_tx_resize(vi, sq, ring->tx_pending); if (err) return err; /* Upon disabling and re-enabling a transmit virtqueue, the device must * set the coalescing parameters of the virtqueue to those configured * through the VIRTIO_NET_CTRL_NOTF_COAL_TX_SET command, or, if the driver * did not set any TX coalescing parameters, to 0. */ err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_tx.max_usecs, vi->intr_coal_tx.max_packets); /* Don't break the tx resize action if the vq coalescing is not * supported. The same is true for rx resize below. */ if (err && err != -EOPNOTSUPP) return err; } if (ring->rx_pending != rx_pending) { err = virtnet_rx_resize(vi, rq, ring->rx_pending); if (err) return err; /* The reason is same as the transmit virtqueue reset */ mutex_lock(&vi->rq[i].dim_lock); err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_rx.max_usecs, vi->intr_coal_rx.max_packets); mutex_unlock(&vi->rq[i].dim_lock); if (err && err != -EOPNOTSUPP) return err; } } return 0; } static bool virtnet_commit_rss_command(struct virtnet_info *vi) { struct net_device *dev = vi->dev; struct scatterlist sgs[2]; /* prepare sgs */ sg_init_table(sgs, 2); sg_set_buf(&sgs[0], vi->rss_hdr, virtnet_rss_hdr_size(vi)); sg_set_buf(&sgs[1], &vi->rss_trailer, virtnet_rss_trailer_size(vi)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) goto err; return true; err: dev_warn(&dev->dev, "VIRTIONET issue with committing RSS sgs\n"); return false; } static void virtnet_init_default_rss(struct virtnet_info *vi) { vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_supported); vi->rss_hash_types_saved = vi->rss_hash_types_supported; vi->rss_hdr->indirection_table_mask = vi->rss_indir_table_size ? cpu_to_le16(vi->rss_indir_table_size - 1) : 0; vi->rss_hdr->unclassified_queue = 0; virtnet_rss_update_by_qpairs(vi, vi->curr_queue_pairs); vi->rss_trailer.hash_key_length = vi->rss_key_size; netdev_rss_key_fill(vi->rss_hash_key_data, vi->rss_key_size); } static int virtnet_get_hashflow(struct net_device *dev, struct ethtool_rxfh_fields *info) { struct virtnet_info *vi = netdev_priv(dev); info->data = 0; switch (info->flow_type) { case TCP_V4_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case TCP_V6_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case UDP_V4_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case UDP_V6_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { info->data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3; } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { info->data = RXH_IP_SRC | RXH_IP_DST; } break; case IPV4_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) info->data = RXH_IP_SRC | RXH_IP_DST; break; case IPV6_FLOW: if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) info->data = RXH_IP_SRC | RXH_IP_DST; break; default: info->data = 0; break; } return 0; } static int virtnet_set_hashflow(struct net_device *dev, const struct ethtool_rxfh_fields *info, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); u32 new_hashtypes = vi->rss_hash_types_saved; bool is_disable = info->data & RXH_DISCARD; bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3); /* supports only 'sd', 'sdfn' and 'r' */ if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable)) return -EINVAL; switch (info->flow_type) { case TCP_V4_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_TCPv4); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv4 : 0); break; case UDP_V4_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_UDPv4); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv4 : 0); break; case IPV4_FLOW: new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv4; if (!is_disable) new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv4; break; case TCP_V6_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_TCPv6); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv6 : 0); break; case UDP_V6_FLOW: new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_UDPv6); if (!is_disable) new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv6 : 0); break; case IPV6_FLOW: new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv6; if (!is_disable) new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv6; break; default: /* unsupported flow */ return -EINVAL; } /* if unsupported hashtype was set */ if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported)) return -EINVAL; if (new_hashtypes != vi->rss_hash_types_saved) { vi->rss_hash_types_saved = new_hashtypes; vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); if (vi->dev->features & NETIF_F_RXHASH) if (!virtnet_commit_rss_command(vi)) return -EINVAL; } return 0; } static void virtnet_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi->vdev; strscpy(info->driver, KBUILD_MODNAME, sizeof(info->driver)); strscpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version)); strscpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info)); } /* TODO: Eliminate OOO packets during switching */ static int virtnet_set_channels(struct net_device *dev, struct ethtool_channels *channels) { struct virtnet_info *vi = netdev_priv(dev); u16 queue_pairs = channels->combined_count; int err; /* We don't support separate rx/tx channels. * We don't allow setting 'other' channels. */ if (channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL; if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0) return -EINVAL; /* For now we don't support modifying channels while XDP is loaded * also when XDP is loaded all RX queues have XDP programs so we only * need to check a single RX queue. */ if (vi->rq[0].xdp_prog) return -EINVAL; cpus_read_lock(); err = virtnet_set_queues(vi, queue_pairs); if (err) { cpus_read_unlock(); goto err; } virtnet_set_affinity(vi); cpus_read_unlock(); netif_set_real_num_tx_queues(dev, queue_pairs); netif_set_real_num_rx_queues(dev, queue_pairs); err: return err; } static void virtnet_stats_sprintf(u8 **p, const char *fmt, const char *noq_fmt, int num, int qid, const struct virtnet_stat_desc *desc) { int i; if (qid < 0) { for (i = 0; i < num; ++i) ethtool_sprintf(p, noq_fmt, desc[i].desc); } else { for (i = 0; i < num; ++i) ethtool_sprintf(p, fmt, qid, desc[i].desc); } } /* qid == -1: for rx/tx queue total field */ static void virtnet_get_stats_string(struct virtnet_info *vi, int type, int qid, u8 **data) { const struct virtnet_stat_desc *desc; const char *fmt, *noq_fmt; u8 *p = *data; u32 num; if (type == VIRTNET_Q_TYPE_CQ && qid >= 0) { noq_fmt = "cq_hw_%s"; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { desc = &virtnet_stats_cvq_desc[0]; num = ARRAY_SIZE(virtnet_stats_cvq_desc); virtnet_stats_sprintf(&p, NULL, noq_fmt, num, -1, desc); } } if (type == VIRTNET_Q_TYPE_RX) { fmt = "rx%u_%s"; noq_fmt = "rx_%s"; desc = &virtnet_rq_stats_desc[0]; num = ARRAY_SIZE(virtnet_rq_stats_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); fmt = "rx%u_hw_%s"; noq_fmt = "rx_hw_%s"; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { desc = &virtnet_stats_rx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_basic_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { desc = &virtnet_stats_rx_csum_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_csum_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { desc = &virtnet_stats_rx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_speed_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } } if (type == VIRTNET_Q_TYPE_TX) { fmt = "tx%u_%s"; noq_fmt = "tx_%s"; desc = &virtnet_sq_stats_desc[0]; num = ARRAY_SIZE(virtnet_sq_stats_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); fmt = "tx%u_hw_%s"; noq_fmt = "tx_hw_%s"; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { desc = &virtnet_stats_tx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_basic_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { desc = &virtnet_stats_tx_gso_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_gso_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { desc = &virtnet_stats_tx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_speed_desc); virtnet_stats_sprintf(&p, fmt, noq_fmt, num, qid, desc); } } *data = p; } struct virtnet_stats_ctx { /* The stats are write to qstats or ethtool -S */ bool to_qstat; /* Used to calculate the offset inside the output buffer. */ u32 desc_num[3]; /* The actual supported stat types. */ u64 bitmap[3]; /* Used to calculate the reply buffer size. */ u32 size[3]; /* Record the output buffer. */ u64 *data; }; static void virtnet_stats_ctx_init(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, u64 *data, bool to_qstat) { u32 queue_type; ctx->data = data; ctx->to_qstat = to_qstat; if (to_qstat) { ctx->desc_num[VIRTNET_Q_TYPE_RX] = ARRAY_SIZE(virtnet_rq_stats_desc_qstat); ctx->desc_num[VIRTNET_Q_TYPE_TX] = ARRAY_SIZE(virtnet_sq_stats_desc_qstat); queue_type = VIRTNET_Q_TYPE_RX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_CSUM; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_csum); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_GSO) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_GSO; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_gso); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_speed); } queue_type = VIRTNET_Q_TYPE_TX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_CSUM; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_csum); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_GSO; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_gso); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_speed); } return; } ctx->desc_num[VIRTNET_Q_TYPE_RX] = ARRAY_SIZE(virtnet_rq_stats_desc); ctx->desc_num[VIRTNET_Q_TYPE_TX] = ARRAY_SIZE(virtnet_sq_stats_desc); if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { queue_type = VIRTNET_Q_TYPE_CQ; ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_CVQ; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_cvq_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_cvq); } queue_type = VIRTNET_Q_TYPE_RX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_basic_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_CSUM; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_csum_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_csum); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_RX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_rx_speed_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_rx_speed); } queue_type = VIRTNET_Q_TYPE_TX; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_BASIC; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_basic_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_basic); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_GSO; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_gso_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_gso); } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_TX_SPEED; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_tx_speed_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_tx_speed); } } /* stats_sum_queue - Calculate the sum of the same fields in sq or rq. * @sum: the position to store the sum values * @num: field num * @q_value: the first queue fields * @q_num: number of the queues */ static void stats_sum_queue(u64 *sum, u32 num, u64 *q_value, u32 q_num) { u32 step = num; int i, j; u64 *p; for (i = 0; i < num; ++i) { p = sum + i; *p = 0; for (j = 0; j < q_num; ++j) *p += *(q_value + i + j * step); } } static void virtnet_fill_total_fields(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx) { u64 *data, *first_rx_q, *first_tx_q; u32 num_cq, num_rx, num_tx; num_cq = ctx->desc_num[VIRTNET_Q_TYPE_CQ]; num_rx = ctx->desc_num[VIRTNET_Q_TYPE_RX]; num_tx = ctx->desc_num[VIRTNET_Q_TYPE_TX]; first_rx_q = ctx->data + num_rx + num_tx + num_cq; first_tx_q = first_rx_q + vi->curr_queue_pairs * num_rx; data = ctx->data; stats_sum_queue(data, num_rx, first_rx_q, vi->curr_queue_pairs); data = ctx->data + num_rx; stats_sum_queue(data, num_tx, first_tx_q, vi->curr_queue_pairs); } static void virtnet_fill_stats_qstat(struct virtnet_info *vi, u32 qid, struct virtnet_stats_ctx *ctx, const u8 *base, bool drv_stats, u8 reply_type) { const struct virtnet_stat_desc *desc; const u64_stats_t *v_stat; u64 offset, bitmap; const __le64 *v; u32 queue_type; int i, num; queue_type = vq_type(vi, qid); bitmap = ctx->bitmap[queue_type]; if (drv_stats) { if (queue_type == VIRTNET_Q_TYPE_RX) { desc = &virtnet_rq_stats_desc_qstat[0]; num = ARRAY_SIZE(virtnet_rq_stats_desc_qstat); } else { desc = &virtnet_sq_stats_desc_qstat[0]; num = ARRAY_SIZE(virtnet_sq_stats_desc_qstat); } for (i = 0; i < num; ++i) { offset = desc[i].qstat_offset / sizeof(*ctx->data); v_stat = (const u64_stats_t *)(base + desc[i].offset); ctx->data[offset] = u64_stats_read(v_stat); } return; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { desc = &virtnet_stats_rx_basic_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_basic_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { desc = &virtnet_stats_rx_csum_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_csum_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_GSO) { desc = &virtnet_stats_rx_gso_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_gso_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_GSO) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { desc = &virtnet_stats_rx_speed_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_rx_speed_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { desc = &virtnet_stats_tx_basic_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_basic_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { desc = &virtnet_stats_tx_csum_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_csum_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_CSUM) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) { desc = &virtnet_stats_tx_gso_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_gso_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) goto found; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { desc = &virtnet_stats_tx_speed_desc_qstat[0]; num = ARRAY_SIZE(virtnet_stats_tx_speed_desc_qstat); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) goto found; } return; found: for (i = 0; i < num; ++i) { offset = desc[i].qstat_offset / sizeof(*ctx->data); v = (const __le64 *)(base + desc[i].offset); ctx->data[offset] = le64_to_cpu(*v); } } /* virtnet_fill_stats - copy the stats to qstats or ethtool -S * The stats source is the device or the driver. * * @vi: virtio net info * @qid: the vq id * @ctx: stats ctx (initiated by virtnet_stats_ctx_init()) * @base: pointer to the device reply or the driver stats structure. * @drv_stats: designate the base type (device reply, driver stats) * @type: the type of the device reply (if drv_stats is true, this must be zero) */ static void virtnet_fill_stats(struct virtnet_info *vi, u32 qid, struct virtnet_stats_ctx *ctx, const u8 *base, bool drv_stats, u8 reply_type) { u32 queue_type, num_rx, num_tx, num_cq; const struct virtnet_stat_desc *desc; const u64_stats_t *v_stat; u64 offset, bitmap; const __le64 *v; int i, num; if (ctx->to_qstat) return virtnet_fill_stats_qstat(vi, qid, ctx, base, drv_stats, reply_type); num_cq = ctx->desc_num[VIRTNET_Q_TYPE_CQ]; num_rx = ctx->desc_num[VIRTNET_Q_TYPE_RX]; num_tx = ctx->desc_num[VIRTNET_Q_TYPE_TX]; queue_type = vq_type(vi, qid); bitmap = ctx->bitmap[queue_type]; /* skip the total fields of pairs */ offset = num_rx + num_tx; if (queue_type == VIRTNET_Q_TYPE_TX) { offset += num_cq + num_rx * vi->curr_queue_pairs + num_tx * (qid / 2); num = ARRAY_SIZE(virtnet_sq_stats_desc); if (drv_stats) { desc = &virtnet_sq_stats_desc[0]; goto drv_stats; } offset += num; } else if (queue_type == VIRTNET_Q_TYPE_RX) { offset += num_cq + num_rx * (qid / 2); num = ARRAY_SIZE(virtnet_rq_stats_desc); if (drv_stats) { desc = &virtnet_rq_stats_desc[0]; goto drv_stats; } offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_CVQ) { desc = &virtnet_stats_cvq_desc[0]; num = ARRAY_SIZE(virtnet_stats_cvq_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_CVQ) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { desc = &virtnet_stats_rx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_basic_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { desc = &virtnet_stats_rx_csum_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_csum_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_RX_SPEED) { desc = &virtnet_stats_rx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_rx_speed_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { desc = &virtnet_stats_tx_basic_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_basic_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_GSO) { desc = &virtnet_stats_tx_gso_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_gso_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO) goto found; offset += num; } if (bitmap & VIRTIO_NET_STATS_TYPE_TX_SPEED) { desc = &virtnet_stats_tx_speed_desc[0]; num = ARRAY_SIZE(virtnet_stats_tx_speed_desc); if (reply_type == VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED) goto found; offset += num; } return; found: for (i = 0; i < num; ++i) { v = (const __le64 *)(base + desc[i].offset); ctx->data[offset + i] = le64_to_cpu(*v); } return; drv_stats: for (i = 0; i < num; ++i) { v_stat = (const u64_stats_t *)(base + desc[i].offset); ctx->data[offset + i] = u64_stats_read(v_stat); } } static int __virtnet_get_hw_stats(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, struct virtio_net_ctrl_queue_stats *req, int req_size, void *reply, int res_size) { struct virtio_net_stats_reply_hdr *hdr; struct scatterlist sgs_in, sgs_out; void *p; u32 qid; int ok; sg_init_one(&sgs_out, req, req_size); sg_init_one(&sgs_in, reply, res_size); ok = virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS, VIRTIO_NET_CTRL_STATS_GET, &sgs_out, &sgs_in); if (!ok) return ok; for (p = reply; p - reply < res_size; p += le16_to_cpu(hdr->size)) { hdr = p; qid = le16_to_cpu(hdr->vq_index); virtnet_fill_stats(vi, qid, ctx, p, false, hdr->type); } return 0; } static void virtnet_make_stat_req(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, struct virtio_net_ctrl_queue_stats *req, int qid, int *idx) { int qtype = vq_type(vi, qid); u64 bitmap = ctx->bitmap[qtype]; if (!bitmap) return; req->stats[*idx].vq_index = cpu_to_le16(qid); req->stats[*idx].types_bitmap[0] = cpu_to_le64(bitmap); *idx += 1; } /* qid: -1: get stats of all vq. * > 0: get the stats for the special vq. This must not be cvq. */ static int virtnet_get_hw_stats(struct virtnet_info *vi, struct virtnet_stats_ctx *ctx, int qid) { int qnum, i, j, res_size, qtype, last_vq, first_vq; struct virtio_net_ctrl_queue_stats *req; bool enable_cvq; void *reply; int ok; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) return 0; if (qid == -1) { last_vq = vi->curr_queue_pairs * 2 - 1; first_vq = 0; enable_cvq = true; } else { last_vq = qid; first_vq = qid; enable_cvq = false; } qnum = 0; res_size = 0; for (i = first_vq; i <= last_vq ; ++i) { qtype = vq_type(vi, i); if (ctx->bitmap[qtype]) { ++qnum; res_size += ctx->size[qtype]; } } if (enable_cvq && ctx->bitmap[VIRTNET_Q_TYPE_CQ]) { res_size += ctx->size[VIRTNET_Q_TYPE_CQ]; qnum += 1; } req = kcalloc(qnum, sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; reply = kmalloc(res_size, GFP_KERNEL); if (!reply) { kfree(req); return -ENOMEM; } j = 0; for (i = first_vq; i <= last_vq ; ++i) virtnet_make_stat_req(vi, ctx, req, i, &j); if (enable_cvq) virtnet_make_stat_req(vi, ctx, req, vi->max_queue_pairs * 2, &j); ok = __virtnet_get_hw_stats(vi, ctx, req, sizeof(*req) * j, reply, res_size); kfree(req); kfree(reply); return ok; } static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data) { struct virtnet_info *vi = netdev_priv(dev); unsigned int i; u8 *p = data; switch (stringset) { case ETH_SS_STATS: /* Generate the total field names. */ virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, -1, &p); virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, -1, &p); virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_CQ, 0, &p); for (i = 0; i < vi->curr_queue_pairs; ++i) virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_RX, i, &p); for (i = 0; i < vi->curr_queue_pairs; ++i) virtnet_get_stats_string(vi, VIRTNET_Q_TYPE_TX, i, &p); break; } } static int virtnet_get_sset_count(struct net_device *dev, int sset) { struct virtnet_info *vi = netdev_priv(dev); struct virtnet_stats_ctx ctx = {0}; u32 pair_count; switch (sset) { case ETH_SS_STATS: virtnet_stats_ctx_init(vi, &ctx, NULL, false); pair_count = ctx.desc_num[VIRTNET_Q_TYPE_RX] + ctx.desc_num[VIRTNET_Q_TYPE_TX]; return pair_count + ctx.desc_num[VIRTNET_Q_TYPE_CQ] + vi->curr_queue_pairs * pair_count; default: return -EOPNOTSUPP; } } static void virtnet_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct virtnet_info *vi = netdev_priv(dev); struct virtnet_stats_ctx ctx = {0}; unsigned int start, i; const u8 *stats_base; virtnet_stats_ctx_init(vi, &ctx, data, false); if (virtnet_get_hw_stats(vi, &ctx, -1)) dev_warn(&vi->dev->dev, "Failed to get hw stats.\n"); for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; struct send_queue *sq = &vi->sq[i]; stats_base = (const u8 *)&rq->stats; do { start = u64_stats_fetch_begin(&rq->stats.syncp); virtnet_fill_stats(vi, i * 2, &ctx, stats_base, true, 0); } while (u64_stats_fetch_retry(&rq->stats.syncp, start)); stats_base = (const u8 *)&sq->stats; do { start = u64_stats_fetch_begin(&sq->stats.syncp); virtnet_fill_stats(vi, i * 2 + 1, &ctx, stats_base, true, 0); } while (u64_stats_fetch_retry(&sq->stats.syncp, start)); } virtnet_fill_total_fields(vi, &ctx); } static void virtnet_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct virtnet_info *vi = netdev_priv(dev); channels->combined_count = vi->curr_queue_pairs; channels->max_combined = vi->max_queue_pairs; channels->max_other = 0; channels->rx_count = 0; channels->tx_count = 0; channels->other_count = 0; } static int virtnet_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct virtnet_info *vi = netdev_priv(dev); return ethtool_virtdev_set_link_ksettings(dev, cmd, &vi->speed, &vi->duplex); } static int virtnet_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct virtnet_info *vi = netdev_priv(dev); cmd->base.speed = vi->speed; cmd->base.duplex = vi->duplex; cmd->base.port = PORT_OTHER; return 0; } static int virtnet_send_tx_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { struct virtio_net_ctrl_coal_tx *coal_tx __free(kfree) = NULL; struct scatterlist sgs_tx; int i; coal_tx = kzalloc(sizeof(*coal_tx), GFP_KERNEL); if (!coal_tx) return -ENOMEM; coal_tx->tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); coal_tx->tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); sg_init_one(&sgs_tx, coal_tx, sizeof(*coal_tx)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_TX_SET, &sgs_tx)) return -EINVAL; vi->intr_coal_tx.max_usecs = ec->tx_coalesce_usecs; vi->intr_coal_tx.max_packets = ec->tx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) { vi->sq[i].intr_coal.max_usecs = ec->tx_coalesce_usecs; vi->sq[i].intr_coal.max_packets = ec->tx_max_coalesced_frames; } return 0; } static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { struct virtio_net_ctrl_coal_rx *coal_rx __free(kfree) = NULL; bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; struct scatterlist sgs_rx; int i; if (rx_ctrl_dim_on && !virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return -EOPNOTSUPP; if (rx_ctrl_dim_on && (ec->rx_coalesce_usecs != vi->intr_coal_rx.max_usecs || ec->rx_max_coalesced_frames != vi->intr_coal_rx.max_packets)) return -EINVAL; if (rx_ctrl_dim_on && !vi->rx_dim_enabled) { vi->rx_dim_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) { mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = true; mutex_unlock(&vi->rq[i].dim_lock); } return 0; } coal_rx = kzalloc(sizeof(*coal_rx), GFP_KERNEL); if (!coal_rx) return -ENOMEM; if (!rx_ctrl_dim_on && vi->rx_dim_enabled) { vi->rx_dim_enabled = false; for (i = 0; i < vi->max_queue_pairs; i++) { mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].dim_enabled = false; mutex_unlock(&vi->rq[i].dim_lock); } } /* Since the per-queue coalescing params can be set, * we need apply the global new params even if they * are not updated. */ coal_rx->rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); coal_rx->rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); sg_init_one(&sgs_rx, coal_rx, sizeof(*coal_rx)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, &sgs_rx)) return -EINVAL; vi->intr_coal_rx.max_usecs = ec->rx_coalesce_usecs; vi->intr_coal_rx.max_packets = ec->rx_max_coalesced_frames; for (i = 0; i < vi->max_queue_pairs; i++) { mutex_lock(&vi->rq[i].dim_lock); vi->rq[i].intr_coal.max_usecs = ec->rx_coalesce_usecs; vi->rq[i].intr_coal.max_packets = ec->rx_max_coalesced_frames; mutex_unlock(&vi->rq[i].dim_lock); } return 0; } static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec) { int err; err = virtnet_send_tx_notf_coal_cmds(vi, ec); if (err) return err; err = virtnet_send_rx_notf_coal_cmds(vi, ec); if (err) return err; return 0; } static int virtnet_send_rx_notf_coal_vq_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec, u16 queue) { bool rx_ctrl_dim_on = !!ec->use_adaptive_rx_coalesce; u32 max_usecs, max_packets; bool cur_rx_dim; int err; mutex_lock(&vi->rq[queue].dim_lock); cur_rx_dim = vi->rq[queue].dim_enabled; max_usecs = vi->rq[queue].intr_coal.max_usecs; max_packets = vi->rq[queue].intr_coal.max_packets; if (rx_ctrl_dim_on && (ec->rx_coalesce_usecs != max_usecs || ec->rx_max_coalesced_frames != max_packets)) { mutex_unlock(&vi->rq[queue].dim_lock); return -EINVAL; } if (rx_ctrl_dim_on && !cur_rx_dim) { vi->rq[queue].dim_enabled = true; mutex_unlock(&vi->rq[queue].dim_lock); return 0; } if (!rx_ctrl_dim_on && cur_rx_dim) vi->rq[queue].dim_enabled = false; /* If no params are updated, userspace ethtool will * reject the modification. */ err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, queue, ec->rx_coalesce_usecs, ec->rx_max_coalesced_frames); mutex_unlock(&vi->rq[queue].dim_lock); return err; } static int virtnet_send_notf_coal_vq_cmds(struct virtnet_info *vi, struct ethtool_coalesce *ec, u16 queue) { int err; err = virtnet_send_rx_notf_coal_vq_cmds(vi, ec, queue); if (err) return err; err = virtnet_send_tx_ctrl_coal_vq_cmd(vi, queue, ec->tx_coalesce_usecs, ec->tx_max_coalesced_frames); if (err) return err; return 0; } static void virtnet_rx_dim_work(struct work_struct *work) { struct dim *dim = container_of(work, struct dim, work); struct receive_queue *rq = container_of(dim, struct receive_queue, dim); struct virtnet_info *vi = rq->vq->vdev->priv; struct net_device *dev = vi->dev; struct dim_cq_moder update_moder; int qnum, err; qnum = rq - vi->rq; mutex_lock(&rq->dim_lock); if (!rq->dim_enabled) goto out; update_moder = net_dim_get_rx_irq_moder(dev, dim); if (update_moder.usec != rq->intr_coal.max_usecs || update_moder.pkts != rq->intr_coal.max_packets) { err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum, update_moder.usec, update_moder.pkts); if (err) pr_debug("%s: Failed to send dim parameters on rxq%d\n", dev->name, qnum); } out: dim->state = DIM_START_MEASURE; mutex_unlock(&rq->dim_lock); } static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) { /* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL * or VIRTIO_NET_F_VQ_NOTF_COAL feature is negotiated. */ if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs) return -EOPNOTSUPP; if (ec->tx_max_coalesced_frames > 1 || ec->rx_max_coalesced_frames != 1) return -EINVAL; return 0; } static int virtnet_should_update_vq_weight(int dev_flags, int weight, int vq_weight, bool *should_update) { if (weight ^ vq_weight) { if (dev_flags & IFF_UP) return -EBUSY; *should_update = true; } return 0; } static int virtnet_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); int ret, queue_number, napi_weight, i; bool update_napi = false; /* Can't change NAPI weight if the link is up */ napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; for (queue_number = 0; queue_number < vi->max_queue_pairs; queue_number++) { ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, vi->sq[queue_number].napi.weight, &update_napi); if (ret) return ret; if (update_napi) { /* All queues that belong to [queue_number, vi->max_queue_pairs] will be * updated for the sake of simplicity, which might not be necessary */ break; } } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) ret = virtnet_send_notf_coal_cmds(vi, ec); else ret = virtnet_coal_params_supported(ec); if (ret) return ret; if (update_napi) { /* xsk xmit depends on the tx napi. So if xsk is active, * prevent modifications to tx napi. */ for (i = queue_number; i < vi->max_queue_pairs; i++) { if (vi->sq[i].xsk_pool) return -EBUSY; } for (; queue_number < vi->max_queue_pairs; queue_number++) vi->sq[queue_number].napi.weight = napi_weight; } return ret; } static int virtnet_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { ec->rx_coalesce_usecs = vi->intr_coal_rx.max_usecs; ec->tx_coalesce_usecs = vi->intr_coal_tx.max_usecs; ec->tx_max_coalesced_frames = vi->intr_coal_tx.max_packets; ec->rx_max_coalesced_frames = vi->intr_coal_rx.max_packets; ec->use_adaptive_rx_coalesce = vi->rx_dim_enabled; } else { ec->rx_max_coalesced_frames = 1; if (vi->sq[0].napi.weight) ec->tx_max_coalesced_frames = 1; } return 0; } static int virtnet_set_per_queue_coalesce(struct net_device *dev, u32 queue, struct ethtool_coalesce *ec) { struct virtnet_info *vi = netdev_priv(dev); int ret, napi_weight; bool update_napi = false; if (queue >= vi->max_queue_pairs) return -EINVAL; /* Can't change NAPI weight if the link is up */ napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; ret = virtnet_should_update_vq_weight(dev->flags, napi_weight, vi->sq[queue].napi.weight, &update_napi); if (ret) return ret; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) ret = virtnet_send_notf_coal_vq_cmds(vi, ec, queue); else ret = virtnet_coal_params_supported(ec); if (ret) return ret; if (update_napi) vi->sq[queue].napi.weight = napi_weight; return 0; } static int virtnet_get_per_queue_coalesce(struct net_device *dev, u32 queue, struct ethtool_coalesce *ec) { struct virtnet_info *vi = netdev_priv(dev); if (queue >= vi->max_queue_pairs) return -EINVAL; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { mutex_lock(&vi->rq[queue].dim_lock); ec->rx_coalesce_usecs = vi->rq[queue].intr_coal.max_usecs; ec->tx_coalesce_usecs = vi->sq[queue].intr_coal.max_usecs; ec->tx_max_coalesced_frames = vi->sq[queue].intr_coal.max_packets; ec->rx_max_coalesced_frames = vi->rq[queue].intr_coal.max_packets; ec->use_adaptive_rx_coalesce = vi->rq[queue].dim_enabled; mutex_unlock(&vi->rq[queue].dim_lock); } else { ec->rx_max_coalesced_frames = 1; if (vi->sq[queue].napi.weight) ec->tx_max_coalesced_frames = 1; } return 0; } static void virtnet_init_settings(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); vi->speed = SPEED_UNKNOWN; vi->duplex = DUPLEX_UNKNOWN; } static u32 virtnet_get_rxfh_key_size(struct net_device *dev) { return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size; } static u32 virtnet_get_rxfh_indir_size(struct net_device *dev) { return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size; } static int virtnet_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh) { struct virtnet_info *vi = netdev_priv(dev); int i; if (rxfh->indir) { for (i = 0; i < vi->rss_indir_table_size; ++i) rxfh->indir[i] = le16_to_cpu(vi->rss_hdr->indirection_table[i]); } if (rxfh->key) memcpy(rxfh->key, vi->rss_hash_key_data, vi->rss_key_size); rxfh->hfunc = ETH_RSS_HASH_TOP; return 0; } static int virtnet_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); bool update = false; int i; if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && rxfh->hfunc != ETH_RSS_HASH_TOP) return -EOPNOTSUPP; if (rxfh->indir) { if (!vi->has_rss) return -EOPNOTSUPP; for (i = 0; i < vi->rss_indir_table_size; ++i) vi->rss_hdr->indirection_table[i] = cpu_to_le16(rxfh->indir[i]); update = true; } if (rxfh->key) { /* If either _F_HASH_REPORT or _F_RSS are negotiated, the * device provides hash calculation capabilities, that is, * hash_key is configured. */ if (!vi->has_rss && !vi->has_rss_hash_report) return -EOPNOTSUPP; memcpy(vi->rss_hash_key_data, rxfh->key, vi->rss_key_size); update = true; } if (update) virtnet_commit_rss_command(vi); return 0; } static u32 virtnet_get_rx_ring_count(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); return vi->curr_queue_pairs; } static const struct ethtool_ops virtnet_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES | ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_USE_ADAPTIVE_RX, .get_drvinfo = virtnet_get_drvinfo, .get_link = ethtool_op_get_link, .get_ringparam = virtnet_get_ringparam, .set_ringparam = virtnet_set_ringparam, .get_strings = virtnet_get_strings, .get_sset_count = virtnet_get_sset_count, .get_ethtool_stats = virtnet_get_ethtool_stats, .set_channels = virtnet_set_channels, .get_channels = virtnet_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = virtnet_get_link_ksettings, .set_link_ksettings = virtnet_set_link_ksettings, .set_coalesce = virtnet_set_coalesce, .get_coalesce = virtnet_get_coalesce, .set_per_queue_coalesce = virtnet_set_per_queue_coalesce, .get_per_queue_coalesce = virtnet_get_per_queue_coalesce, .get_rxfh_key_size = virtnet_get_rxfh_key_size, .get_rxfh_indir_size = virtnet_get_rxfh_indir_size, .get_rxfh = virtnet_get_rxfh, .set_rxfh = virtnet_set_rxfh, .get_rxfh_fields = virtnet_get_hashflow, .set_rxfh_fields = virtnet_set_hashflow, .get_rx_ring_count = virtnet_get_rx_ring_count, }; static void virtnet_get_queue_stats_rx(struct net_device *dev, int i, struct netdev_queue_stats_rx *stats) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = &vi->rq[i]; struct virtnet_stats_ctx ctx = {0}; virtnet_stats_ctx_init(vi, &ctx, (void *)stats, true); virtnet_get_hw_stats(vi, &ctx, i * 2); virtnet_fill_stats(vi, i * 2, &ctx, (void *)&rq->stats, true, 0); } static void virtnet_get_queue_stats_tx(struct net_device *dev, int i, struct netdev_queue_stats_tx *stats) { struct virtnet_info *vi = netdev_priv(dev); struct send_queue *sq = &vi->sq[i]; struct virtnet_stats_ctx ctx = {0}; virtnet_stats_ctx_init(vi, &ctx, (void *)stats, true); virtnet_get_hw_stats(vi, &ctx, i * 2 + 1); virtnet_fill_stats(vi, i * 2 + 1, &ctx, (void *)&sq->stats, true, 0); } static void virtnet_get_base_stats(struct net_device *dev, struct netdev_queue_stats_rx *rx, struct netdev_queue_stats_tx *tx) { struct virtnet_info *vi = netdev_priv(dev); /* The queue stats of the virtio-net will not be reset. So here we * return 0. */ rx->bytes = 0; rx->packets = 0; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_BASIC) { rx->hw_drops = 0; rx->hw_drop_overruns = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_CSUM) { rx->csum_unnecessary = 0; rx->csum_none = 0; rx->csum_bad = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_GSO) { rx->hw_gro_packets = 0; rx->hw_gro_bytes = 0; rx->hw_gro_wire_packets = 0; rx->hw_gro_wire_bytes = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_RX_SPEED) rx->hw_drop_ratelimits = 0; tx->bytes = 0; tx->packets = 0; tx->stop = 0; tx->wake = 0; if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_BASIC) { tx->hw_drops = 0; tx->hw_drop_errors = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_CSUM) { tx->csum_none = 0; tx->needs_csum = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_GSO) { tx->hw_gso_packets = 0; tx->hw_gso_bytes = 0; tx->hw_gso_wire_packets = 0; tx->hw_gso_wire_bytes = 0; } if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_TX_SPEED) tx->hw_drop_ratelimits = 0; netdev_stat_queue_sum(dev, dev->real_num_rx_queues, vi->max_queue_pairs, rx, dev->real_num_tx_queues, vi->max_queue_pairs, tx); } static const struct netdev_stat_ops virtnet_stat_ops = { .get_queue_stats_rx = virtnet_get_queue_stats_rx, .get_queue_stats_tx = virtnet_get_queue_stats_tx, .get_base_stats = virtnet_get_base_stats, }; static void virtnet_freeze_down(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; /* Make sure no work handler is accessing the device */ flush_work(&vi->config_work); disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); if (netif_running(vi->dev)) { rtnl_lock(); virtnet_close(vi->dev); rtnl_unlock(); } netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); netif_tx_unlock_bh(vi->dev); } static int init_vqs(struct virtnet_info *vi); static int virtnet_restore_up(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; int err; err = init_vqs(vi); if (err) return err; virtio_device_ready(vdev); enable_delayed_refill(vi); enable_rx_mode_work(vi); if (netif_running(vi->dev)) { rtnl_lock(); err = virtnet_open(vi->dev); rtnl_unlock(); if (err) return err; } netif_tx_lock_bh(vi->dev); netif_device_attach(vi->dev); netif_tx_unlock_bh(vi->dev); return err; } static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads) { __virtio64 *_offloads __free(kfree) = NULL; struct scatterlist sg; _offloads = kzalloc(sizeof(*_offloads), GFP_KERNEL); if (!_offloads) return -ENOMEM; *_offloads = cpu_to_virtio64(vi->vdev, offloads); sg_init_one(&sg, _offloads, sizeof(*_offloads)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS, VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) { dev_warn(&vi->dev->dev, "Fail to set guest offload.\n"); return -EINVAL; } return 0; } static int virtnet_clear_guest_offloads(struct virtnet_info *vi) { u64 offloads = 0; if (!vi->guest_offloads) return 0; return virtnet_set_guest_offloads(vi, offloads); } static int virtnet_restore_guest_offloads(struct virtnet_info *vi) { u64 offloads = vi->guest_offloads; if (!vi->guest_offloads) return 0; return virtnet_set_guest_offloads(vi, offloads); } static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, struct xsk_buff_pool *pool) { int err, qindex; qindex = rq - vi->rq; if (pool) { err = xdp_rxq_info_reg(&rq->xsk_rxq_info, vi->dev, qindex, rq->napi.napi_id); if (err < 0) return err; err = xdp_rxq_info_reg_mem_model(&rq->xsk_rxq_info, MEM_TYPE_XSK_BUFF_POOL, NULL); if (err < 0) goto unreg; xsk_pool_set_rxq_info(pool, &rq->xsk_rxq_info); } virtnet_rx_pause(vi, rq); err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf, NULL); if (err) { netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); pool = NULL; } rq->xsk_pool = pool; virtnet_rx_resume(vi, rq); if (pool) return 0; unreg: xdp_rxq_info_unreg(&rq->xsk_rxq_info); return err; } static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, struct send_queue *sq, struct xsk_buff_pool *pool) { int err, qindex; qindex = sq - vi->sq; virtnet_tx_pause(vi, sq); err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, virtnet_sq_free_unused_buf_done); if (err) { netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); pool = NULL; } sq->xsk_pool = pool; virtnet_tx_resume(vi, sq); return err; } static int virtnet_xsk_pool_enable(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq; struct device *dma_dev; struct send_queue *sq; dma_addr_t hdr_dma; int err, size; if (vi->hdr_len > xsk_pool_get_headroom(pool)) return -EINVAL; /* In big_packets mode, xdp cannot work, so there is no need to * initialize xsk of rq. */ if (vi->big_packets && !vi->mergeable_rx_bufs) return -ENOENT; if (qid >= vi->curr_queue_pairs) return -EINVAL; sq = &vi->sq[qid]; rq = &vi->rq[qid]; /* xsk assumes that tx and rx must have the same dma device. The af-xdp * may use one buffer to receive from the rx and reuse this buffer to * send by the tx. So the dma dev of sq and rq must be the same one. * * But vq->dma_dev allows every vq has the respective dma dev. So I * check the dma dev of vq and sq is the same dev. */ if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) return -EINVAL; dma_dev = virtqueue_dma_dev(rq->vq); if (!dma_dev) return -EINVAL; size = virtqueue_get_vring_size(rq->vq); rq->xsk_buffs = kvcalloc(size, sizeof(*rq->xsk_buffs), GFP_KERNEL); if (!rq->xsk_buffs) return -ENOMEM; hdr_dma = virtqueue_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE, 0); if (virtqueue_map_mapping_error(sq->vq, hdr_dma)) { err = -ENOMEM; goto err_free_buffs; } err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) goto err_xsk_map; err = virtnet_rq_bind_xsk_pool(vi, rq, pool); if (err) goto err_rq; err = virtnet_sq_bind_xsk_pool(vi, sq, pool); if (err) goto err_sq; /* Now, we do not support tx offload(such as tx csum), so all the tx * virtnet hdr is zero. So all the tx packets can share a single hdr. */ sq->xsk_hdr_dma_addr = hdr_dma; return 0; err_sq: virtnet_rq_bind_xsk_pool(vi, rq, NULL); err_rq: xsk_pool_dma_unmap(pool, 0); err_xsk_map: virtqueue_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, DMA_TO_DEVICE, 0); err_free_buffs: kvfree(rq->xsk_buffs); return err; } static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) { struct virtnet_info *vi = netdev_priv(dev); struct xsk_buff_pool *pool; struct receive_queue *rq; struct send_queue *sq; int err; if (qid >= vi->curr_queue_pairs) return -EINVAL; sq = &vi->sq[qid]; rq = &vi->rq[qid]; pool = rq->xsk_pool; err = virtnet_rq_bind_xsk_pool(vi, rq, NULL); err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL); xsk_pool_dma_unmap(pool, 0); virtqueue_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, vi->hdr_len, DMA_TO_DEVICE, 0); kvfree(rq->xsk_buffs); return err; } static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) { if (xdp->xsk.pool) return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, xdp->xsk.queue_id); else return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); } static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { unsigned int room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + sizeof(struct skb_shared_info)); unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN; struct virtnet_info *vi = netdev_priv(dev); struct bpf_prog *old_prog; u16 xdp_qp = 0, curr_qp; int i, err; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6))) { NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first"); return -EOPNOTSUPP; } if (vi->mergeable_rx_bufs && !vi->any_header_sg) { NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required"); return -EINVAL; } if (prog && !prog->aux->xdp_has_frags && dev->mtu > max_sz) { NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP without frags"); netdev_warn(dev, "single-buffer XDP requires MTU less than %u\n", max_sz); return -EINVAL; } curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs; if (prog) xdp_qp = nr_cpu_ids; /* XDP requires extra queues for XDP_TX */ if (curr_qp + xdp_qp > vi->max_queue_pairs) { netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n", curr_qp + xdp_qp, vi->max_queue_pairs); xdp_qp = 0; } old_prog = rtnl_dereference(vi->rq[0].xdp_prog); if (!prog && !old_prog) return 0; if (prog) bpf_prog_add(prog, vi->max_queue_pairs - 1); virtnet_rx_pause_all(vi); /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++) virtnet_napi_tx_disable(&vi->sq[i]); } if (!prog) { for (i = 0; i < vi->max_queue_pairs; i++) { rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (i == 0) virtnet_restore_guest_offloads(vi); } synchronize_net(); } err = virtnet_set_queues(vi, curr_qp + xdp_qp); if (err) goto err; netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp); vi->xdp_queue_pairs = xdp_qp; if (prog) { vi->xdp_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) { rcu_assign_pointer(vi->rq[i].xdp_prog, prog); if (i == 0 && !old_prog) virtnet_clear_guest_offloads(vi); } if (!old_prog) xdp_features_set_redirect_target(dev, true); } else { xdp_features_clear_redirect_target(dev); vi->xdp_enabled = false; } virtnet_rx_resume_all(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (old_prog) bpf_prog_put(old_prog); if (netif_running(dev)) virtnet_napi_tx_enable(&vi->sq[i]); } return 0; err: if (!prog) { virtnet_clear_guest_offloads(vi); for (i = 0; i < vi->max_queue_pairs; i++) rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog); } virtnet_rx_resume_all(vi); if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++) virtnet_napi_tx_enable(&vi->sq[i]); } if (prog) bpf_prog_sub(prog, vi->max_queue_pairs - 1); return err; } static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return virtnet_xdp_set(dev, xdp->prog, xdp->extack); case XDP_SETUP_XSK_POOL: return virtnet_xsk_pool_setup(dev, xdp); default: return -EINVAL; } } static int virtnet_get_phys_port_name(struct net_device *dev, char *buf, size_t len) { struct virtnet_info *vi = netdev_priv(dev); int ret; if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY)) return -EOPNOTSUPP; ret = snprintf(buf, len, "sby"); if (ret >= len) return -EOPNOTSUPP; return 0; } static int virtnet_set_features(struct net_device *dev, netdev_features_t features) { struct virtnet_info *vi = netdev_priv(dev); u64 offloads; int err; if ((dev->features ^ features) & NETIF_F_GRO_HW) { if (vi->xdp_enabled) return -EBUSY; if (features & NETIF_F_GRO_HW) offloads = vi->guest_offloads_capable; else offloads = vi->guest_offloads_capable & ~GUEST_OFFLOAD_GRO_HW_MASK; err = virtnet_set_guest_offloads(vi, offloads); if (err) return err; vi->guest_offloads = offloads; } if ((dev->features ^ features) & NETIF_F_RXHASH) { if (features & NETIF_F_RXHASH) vi->rss_hdr->hash_types = cpu_to_le32(vi->rss_hash_types_saved); else vi->rss_hdr->hash_types = cpu_to_le32(VIRTIO_NET_HASH_REPORT_NONE); if (!virtnet_commit_rss_command(vi)) return -EINVAL; } return 0; } static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct virtnet_info *priv = netdev_priv(dev); struct send_queue *sq = &priv->sq[txqueue]; struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.tx_timeouts); u64_stats_update_end(&sq->stats.syncp); netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n", txqueue, sq->name, sq->vq->index, sq->vq->name, jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start))); } static int virtnet_init_irq_moder(struct virtnet_info *vi) { u8 profile_flags = 0, coal_flags = 0; int ret, i; profile_flags |= DIM_PROFILE_RX; coal_flags |= DIM_COALESCE_USEC | DIM_COALESCE_PKTS; ret = net_dim_init_irq_moder(vi->dev, profile_flags, coal_flags, DIM_CQ_PERIOD_MODE_START_FROM_EQE, 0, virtnet_rx_dim_work, NULL); if (ret) return ret; for (i = 0; i < vi->max_queue_pairs; i++) net_dim_setting(vi->dev, &vi->rq[i].dim, false); return 0; } static void virtnet_free_irq_moder(struct virtnet_info *vi) { if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) return; rtnl_lock(); net_dim_free_irq_moder(vi->dev); rtnl_unlock(); } static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, .ndo_start_xmit = start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = virtnet_set_mac_address, .ndo_set_rx_mode = virtnet_set_rx_mode, .ndo_get_stats64 = virtnet_stats, .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, .ndo_xsk_wakeup = virtnet_xsk_wakeup, .ndo_features_check = passthru_features_check, .ndo_get_phys_port_name = virtnet_get_phys_port_name, .ndo_set_features = virtnet_set_features, .ndo_tx_timeout = virtnet_tx_timeout, }; static void virtnet_config_changed_work(struct work_struct *work) { struct virtnet_info *vi = container_of(work, struct virtnet_info, config_work); u16 v; if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS, struct virtio_net_config, status, &v) < 0) return; if (v & VIRTIO_NET_S_ANNOUNCE) { netdev_notify_peers(vi->dev); virtnet_ack_link_announce(vi); } /* Ignore unknown (future) status bits */ v &= VIRTIO_NET_S_LINK_UP; if (vi->status == v) return; vi->status = v; if (vi->status & VIRTIO_NET_S_LINK_UP) { virtnet_update_settings(vi); netif_carrier_on(vi->dev); netif_tx_wake_all_queues(vi->dev); } else { netif_carrier_off(vi->dev); netif_tx_stop_all_queues(vi->dev); } } static void virtnet_config_changed(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; schedule_work(&vi->config_work); } static void virtnet_free_queues(struct virtnet_info *vi) { int i; for (i = 0; i < vi->max_queue_pairs; i++) { __netif_napi_del(&vi->rq[i].napi); __netif_napi_del(&vi->sq[i].napi); } /* We called __netif_napi_del(), * we need to respect an RCU grace period before freeing vi->rq */ synchronize_net(); kfree(vi->rq); kfree(vi->sq); kfree(vi->ctrl); } static void _free_receive_bufs(struct virtnet_info *vi) { struct bpf_prog *old_prog; int i; for (i = 0; i < vi->max_queue_pairs; i++) { while (vi->rq[i].pages) __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0); old_prog = rtnl_dereference(vi->rq[i].xdp_prog); RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL); if (old_prog) bpf_prog_put(old_prog); } } static void free_receive_bufs(struct virtnet_info *vi) { rtnl_lock(); _free_receive_bufs(vi); rtnl_unlock(); } static void free_receive_page_frags(struct virtnet_info *vi) { int i; for (i = 0; i < vi->max_queue_pairs; i++) if (vi->rq[i].alloc_frag.page) { if (vi->rq[i].last_dma) virtnet_rq_unmap(&vi->rq[i], vi->rq[i].last_dma, 0); put_page(vi->rq[i].alloc_frag.page); } } static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) { struct virtnet_info *vi = vq->vdev->priv; struct send_queue *sq; int i = vq2txq(vq); sq = &vi->sq[i]; switch (virtnet_xmit_ptr_unpack(&buf)) { case VIRTNET_XMIT_TYPE_SKB: case VIRTNET_XMIT_TYPE_SKB_ORPHAN: dev_kfree_skb(buf); break; case VIRTNET_XMIT_TYPE_XDP: xdp_return_frame(buf); break; case VIRTNET_XMIT_TYPE_XSK: xsk_tx_completed(sq->xsk_pool, 1); break; } } static void virtnet_sq_free_unused_buf_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; int i = vq2txq(vq); netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); } static void free_unused_bufs(struct virtnet_info *vi) { void *buf; int i; for (i = 0; i < vi->max_queue_pairs; i++) { struct virtqueue *vq = vi->sq[i].vq; while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) virtnet_sq_free_unused_buf(vq, buf); cond_resched(); } for (i = 0; i < vi->max_queue_pairs; i++) { struct virtqueue *vq = vi->rq[i].vq; while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) virtnet_rq_unmap_free_buf(vq, buf); cond_resched(); } } static void virtnet_del_vqs(struct virtnet_info *vi) { struct virtio_device *vdev = vi->vdev; virtnet_clean_affinity(vi); vdev->config->del_vqs(vdev); virtnet_free_queues(vi); } /* How large should a single buffer be so a queue full of these can fit at * least one full packet? * Logic below assumes the mergeable buffer header is used. */ static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq) { const unsigned int hdr_len = vi->hdr_len; unsigned int rq_size = virtqueue_get_vring_size(vq); unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu; unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len; unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size); return max(max(min_buf_len, hdr_len) - hdr_len, (unsigned int)GOOD_PACKET_LEN); } static int virtnet_find_vqs(struct virtnet_info *vi) { struct virtqueue_info *vqs_info; struct virtqueue **vqs; int ret = -ENOMEM; int total_vqs; bool *ctx; u16 i; /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by * possible control vq. */ total_vqs = vi->max_queue_pairs * 2 + virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ); /* Allocate space for find_vqs parameters */ vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL); if (!vqs) goto err_vq; vqs_info = kcalloc(total_vqs, sizeof(*vqs_info), GFP_KERNEL); if (!vqs_info) goto err_vqs_info; if (!vi->big_packets || vi->mergeable_rx_bufs) { ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL); if (!ctx) goto err_ctx; } else { ctx = NULL; } /* Parameters for control virtqueue, if any */ if (vi->has_cvq) { vqs_info[total_vqs - 1].name = "control"; } /* Allocate/initialize parameters for send/receive virtqueues */ for (i = 0; i < vi->max_queue_pairs; i++) { vqs_info[rxq2vq(i)].callback = skb_recv_done; vqs_info[txq2vq(i)].callback = skb_xmit_done; sprintf(vi->rq[i].name, "input.%u", i); sprintf(vi->sq[i].name, "output.%u", i); vqs_info[rxq2vq(i)].name = vi->rq[i].name; vqs_info[txq2vq(i)].name = vi->sq[i].name; if (ctx) vqs_info[rxq2vq(i)].ctx = true; } ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, vqs_info, NULL); if (ret) goto err_find; if (vi->has_cvq) { vi->cvq = vqs[total_vqs - 1]; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN)) vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].vq = vqs[rxq2vq(i)]; vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq); vi->sq[i].vq = vqs[txq2vq(i)]; } /* run here: ret == 0. */ err_find: kfree(ctx); err_ctx: kfree(vqs_info); err_vqs_info: kfree(vqs); err_vq: return ret; } static int virtnet_alloc_queues(struct virtnet_info *vi) { int i; if (vi->has_cvq) { vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL); if (!vi->ctrl) goto err_ctrl; } else { vi->ctrl = NULL; } vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL); if (!vi->sq) goto err_sq; vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL); if (!vi->rq) goto err_rq; INIT_DELAYED_WORK(&vi->refill, refill_work); for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].pages = NULL; netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll, i); vi->rq[i].napi.weight = napi_weight; netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_tx ? napi_weight : 0); sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg)); u64_stats_init(&vi->rq[i].stats.syncp); u64_stats_init(&vi->sq[i].stats.syncp); mutex_init(&vi->rq[i].dim_lock); } return 0; err_rq: kfree(vi->sq); err_sq: kfree(vi->ctrl); err_ctrl: return -ENOMEM; } static int init_vqs(struct virtnet_info *vi) { int ret; /* Allocate send & receive queues */ ret = virtnet_alloc_queues(vi); if (ret) goto err; ret = virtnet_find_vqs(vi); if (ret) goto err_free; cpus_read_lock(); virtnet_set_affinity(vi); cpus_read_unlock(); return 0; err_free: virtnet_free_queues(vi); err: return ret; } #ifdef CONFIG_SYSFS static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue, char *buf) { struct virtnet_info *vi = netdev_priv(queue->dev); unsigned int queue_index = get_netdev_rx_queue_index(queue); unsigned int headroom = virtnet_get_headroom(vi); unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0; struct ewma_pkt_len *avg; BUG_ON(queue_index >= vi->max_queue_pairs); avg = &vi->rq[queue_index].mrg_avg_pkt_len; return sprintf(buf, "%u\n", get_mergeable_buf_len(&vi->rq[queue_index], avg, SKB_DATA_ALIGN(headroom + tailroom))); } static struct rx_queue_attribute mergeable_rx_buffer_size_attribute = __ATTR_RO(mergeable_rx_buffer_size); static struct attribute *virtio_net_mrg_rx_attrs[] = { &mergeable_rx_buffer_size_attribute.attr, NULL }; static const struct attribute_group virtio_net_mrg_rx_group = { .name = "virtio_net", .attrs = virtio_net_mrg_rx_attrs }; #endif static bool virtnet_fail_on_feature(struct virtio_device *vdev, unsigned int fbit, const char *fname, const char *dname) { if (!virtio_has_feature(vdev, fbit)) return false; dev_err(&vdev->dev, "device advertises feature %s but not %s", fname, dname); return true; } #define VIRTNET_FAIL_ON(vdev, fbit, dbit) \ virtnet_fail_on_feature(vdev, fbit, #fbit, dbit) static bool virtnet_validate_features(struct virtio_device *vdev) { if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) && (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL, "VIRTIO_NET_F_CTRL_VQ") || VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_VQ_NOTF_COAL, "VIRTIO_NET_F_CTRL_VQ"))) { return false; } return true; } #define MIN_MTU ETH_MIN_MTU #define MAX_MTU ETH_MAX_MTU static int virtnet_validate(struct virtio_device *vdev) { if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } if (!virtnet_validate_features(vdev)) return -EINVAL; if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { int mtu = virtio_cread16(vdev, offsetof(struct virtio_net_config, mtu)); if (mtu < MIN_MTU) __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU); } if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY) && !virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { dev_warn(&vdev->dev, "device advertises feature VIRTIO_NET_F_STANDBY but not VIRTIO_NET_F_MAC, disabling standby"); __virtio_clear_bit(vdev, VIRTIO_NET_F_STANDBY); } return 0; } static bool virtnet_check_guest_gso(const struct virtnet_info *vi) { return virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) && virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6)); } static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu) { bool guest_gso = virtnet_check_guest_gso(vi); /* If device can receive ANY guest GSO packets, regardless of mtu, * allocate packets of maximum size, otherwise limit it to only * mtu size worth only. */ if (mtu > ETH_DATA_LEN || guest_gso) { vi->big_packets = true; vi->big_packets_num_skbfrags = guest_gso ? MAX_SKB_FRAGS : DIV_ROUND_UP(mtu, PAGE_SIZE); } } #define VIRTIO_NET_HASH_REPORT_MAX_TABLE 10 static enum xdp_rss_hash_type virtnet_xdp_rss_type[VIRTIO_NET_HASH_REPORT_MAX_TABLE] = { [VIRTIO_NET_HASH_REPORT_NONE] = XDP_RSS_TYPE_NONE, [VIRTIO_NET_HASH_REPORT_IPv4] = XDP_RSS_TYPE_L3_IPV4, [VIRTIO_NET_HASH_REPORT_TCPv4] = XDP_RSS_TYPE_L4_IPV4_TCP, [VIRTIO_NET_HASH_REPORT_UDPv4] = XDP_RSS_TYPE_L4_IPV4_UDP, [VIRTIO_NET_HASH_REPORT_IPv6] = XDP_RSS_TYPE_L3_IPV6, [VIRTIO_NET_HASH_REPORT_TCPv6] = XDP_RSS_TYPE_L4_IPV6_TCP, [VIRTIO_NET_HASH_REPORT_UDPv6] = XDP_RSS_TYPE_L4_IPV6_UDP, [VIRTIO_NET_HASH_REPORT_IPv6_EX] = XDP_RSS_TYPE_L3_IPV6_EX, [VIRTIO_NET_HASH_REPORT_TCPv6_EX] = XDP_RSS_TYPE_L4_IPV6_TCP_EX, [VIRTIO_NET_HASH_REPORT_UDPv6_EX] = XDP_RSS_TYPE_L4_IPV6_UDP_EX }; static int virtnet_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash, enum xdp_rss_hash_type *rss_type) { const struct xdp_buff *xdp = (void *)_ctx; struct virtio_net_hdr_v1_hash *hdr_hash; struct virtnet_info *vi; u16 hash_report; if (!(xdp->rxq->dev->features & NETIF_F_RXHASH)) return -ENODATA; vi = netdev_priv(xdp->rxq->dev); hdr_hash = (struct virtio_net_hdr_v1_hash *)(xdp->data - vi->hdr_len); hash_report = __le16_to_cpu(hdr_hash->hash_report); if (hash_report >= VIRTIO_NET_HASH_REPORT_MAX_TABLE) hash_report = VIRTIO_NET_HASH_REPORT_NONE; *rss_type = virtnet_xdp_rss_type[hash_report]; *hash = __le32_to_cpu(hdr_hash->hash_value); return 0; } static const struct xdp_metadata_ops virtnet_xdp_metadata_ops = { .xmo_rx_hash = virtnet_xdp_rx_hash, }; static int virtnet_probe(struct virtio_device *vdev) { int i, err = -ENOMEM; struct net_device *dev; struct virtnet_info *vi; u16 max_queue_pairs; int mtu = 0; /* Find if host supports multiqueue/rss virtio_net device */ max_queue_pairs = 1; if (virtio_has_feature(vdev, VIRTIO_NET_F_MQ) || virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) max_queue_pairs = virtio_cread16(vdev, offsetof(struct virtio_net_config, max_virtqueue_pairs)); /* We need at least 2 queue's */ if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN || max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX || !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) max_queue_pairs = 1; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs); if (!dev) return -ENOMEM; /* Set up network device as normal. */ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE | IFF_TX_SKB_NO_LINEAR; dev->netdev_ops = &virtnet_netdev; dev->stat_ops = &virtnet_stat_ops; dev->features = NETIF_F_HIGHDMA; dev->ethtool_ops = &virtnet_ethtool_ops; SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (csum) dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG; if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6; } /* Individual feature bits: what can host handle? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4)) dev->hw_features |= NETIF_F_TSO; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6)) dev->hw_features |= NETIF_F_TSO6; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN)) dev->hw_features |= NETIF_F_TSO_ECN; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO)) dev->hw_features |= NETIF_F_GSO_UDP_L4; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO)) { dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; dev->hw_enc_features = dev->hw_features; } if (dev->hw_features & NETIF_F_GSO_UDP_TUNNEL && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM)) { dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; dev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; } dev->features |= NETIF_F_GSO_ROBUST; if (gso) dev->features |= dev->hw_features; /* (!csum && gso) case will be fixed by register_netdev() */ } /* 1. With VIRTIO_NET_F_GUEST_CSUM negotiation, the driver doesn't * need to calculate checksums for partially checksummed packets, * as they're considered valid by the upper layer. * 2. Without VIRTIO_NET_F_GUEST_CSUM negotiation, the driver only * receives fully checksummed packets. The device may assist in * validating these packets' checksums, so the driver won't have to. */ dev->features |= NETIF_F_RXCSUM; if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) dev->features |= NETIF_F_GRO_HW; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) dev->hw_features |= NETIF_F_GRO_HW; dev->vlan_features = dev->features; dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_XSK_ZEROCOPY; /* MTU range: 68 - 65535 */ dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU; /* Configuration may specify what MAC to use. Otherwise random. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { u8 addr[ETH_ALEN]; virtio_cread_bytes(vdev, offsetof(struct virtio_net_config, mac), addr, ETH_ALEN); eth_hw_addr_set(dev, addr); } else { eth_hw_addr_random(dev); dev_info(&vdev->dev, "Assigned random MAC address %pM\n", dev->dev_addr); } /* Set up our device-specific information */ vi = netdev_priv(dev); vi->dev = dev; vi->vdev = vdev; vdev->priv = vi; INIT_WORK(&vi->config_work, virtnet_config_changed_work); INIT_WORK(&vi->rx_mode_work, virtnet_rx_mode_work); spin_lock_init(&vi->refill_lock); if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { vi->mergeable_rx_bufs = true; dev->xdp_features |= NETDEV_XDP_ACT_RX_SG; } if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) vi->has_rss_hash_report = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) { vi->has_rss = true; vi->rss_indir_table_size = virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); } vi->rss_hdr = devm_kzalloc(&vdev->dev, virtnet_rss_hdr_size(vi), GFP_KERNEL); if (!vi->rss_hdr) { err = -ENOMEM; goto free; } if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); if (vi->rss_key_size > VIRTIO_NET_RSS_MAX_KEY_SIZE) { dev_err(&vdev->dev, "rss_max_key_size=%u exceeds the limit %u.\n", vi->rss_key_size, VIRTIO_NET_RSS_MAX_KEY_SIZE); err = -EINVAL; goto free; } vi->rss_hash_types_supported = virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types)); vi->rss_hash_types_supported &= ~(VIRTIO_NET_RSS_HASH_TYPE_IP_EX | VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | VIRTIO_NET_RSS_HASH_TYPE_UDP_EX); dev->hw_features |= NETIF_F_RXHASH; dev->xdp_metadata_ops = &virtnet_xdp_metadata_ops; } if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) || virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO)) vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash_tunnel); else if (vi->has_rss_hash_report) vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash); else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf); else vi->hdr_len = sizeof(struct virtio_net_hdr); if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM)) vi->rx_tnl_csum = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO)) vi->rx_tnl = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO)) vi->tx_tnl = true; if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) || virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) vi->any_header_sg = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) vi->has_cvq = true; mutex_init(&vi->cvq_lock); if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) { mtu = virtio_cread16(vdev, offsetof(struct virtio_net_config, mtu)); if (mtu < dev->min_mtu) { /* Should never trigger: MTU was previously validated * in virtnet_validate. */ dev_err(&vdev->dev, "device MTU appears to have changed it is now %d < %d", mtu, dev->min_mtu); err = -EINVAL; goto free; } dev->mtu = mtu; dev->max_mtu = mtu; } virtnet_set_big_packets(vi, mtu); if (vi->any_header_sg) dev->needed_headroom = vi->hdr_len; /* Enable multiqueue by default */ if (num_online_cpus() >= max_queue_pairs) vi->curr_queue_pairs = max_queue_pairs; else vi->curr_queue_pairs = num_online_cpus(); vi->max_queue_pairs = max_queue_pairs; /* Allocate/initialize the rx/tx queues, and invoke find_vqs */ err = init_vqs(vi); if (err) goto free; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { vi->intr_coal_rx.max_usecs = 0; vi->intr_coal_tx.max_usecs = 0; vi->intr_coal_rx.max_packets = 0; /* Keep the default values of the coalescing parameters * aligned with the default napi_tx state. */ if (vi->sq[0].napi.weight) vi->intr_coal_tx.max_packets = 1; else vi->intr_coal_tx.max_packets = 0; } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { /* The reason is the same as VIRTIO_NET_F_NOTF_COAL. */ for (i = 0; i < vi->max_queue_pairs; i++) if (vi->sq[i].napi.weight) vi->sq[i].intr_coal.max_packets = 1; err = virtnet_init_irq_moder(vi); if (err) goto free; } #ifdef CONFIG_SYSFS if (vi->mergeable_rx_bufs) dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group; #endif netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs); netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs); virtnet_init_settings(dev); if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) { vi->failover = net_failover_create(vi->dev); if (IS_ERR(vi->failover)) { err = PTR_ERR(vi->failover); goto free_vqs; } } if (vi->has_rss || vi->has_rss_hash_report) virtnet_init_default_rss(vi); enable_rx_mode_work(vi); /* serialize netdev register + virtio_device_ready() with ndo_open() */ rtnl_lock(); err = register_netdevice(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); rtnl_unlock(); goto free_failover; } /* Disable config change notification until ndo_open. */ virtio_config_driver_disable(vi->vdev); virtio_device_ready(vdev); if (vi->has_rss || vi->has_rss_hash_report) { if (!virtnet_commit_rss_command(vi)) { dev_warn(&vdev->dev, "RSS disabled because committing failed.\n"); dev->hw_features &= ~NETIF_F_RXHASH; vi->has_rss_hash_report = false; vi->has_rss = false; } } virtnet_set_queues(vi, vi->curr_queue_pairs); /* a random MAC address has been assigned, notify the device. * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there * because many devices work fine without getting MAC explicitly */ if (!virtio_has_feature(vdev, VIRTIO_NET_F_MAC) && virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) { struct scatterlist sg; sg_init_one(&sg, dev->dev_addr, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) { pr_debug("virtio_net: setting MAC address failed\n"); rtnl_unlock(); err = -EINVAL; goto free_unregister_netdev; } } if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_DEVICE_STATS)) { struct virtio_net_stats_capabilities *stats_cap __free(kfree) = NULL; struct scatterlist sg; __le64 v; stats_cap = kzalloc(sizeof(*stats_cap), GFP_KERNEL); if (!stats_cap) { rtnl_unlock(); err = -ENOMEM; goto free_unregister_netdev; } sg_init_one(&sg, stats_cap, sizeof(*stats_cap)); if (!virtnet_send_command_reply(vi, VIRTIO_NET_CTRL_STATS, VIRTIO_NET_CTRL_STATS_QUERY, NULL, &sg)) { pr_debug("virtio_net: fail to get stats capability\n"); rtnl_unlock(); err = -EINVAL; goto free_unregister_netdev; } v = stats_cap->supported_stats_types[0]; vi->device_stats_cap = le64_to_cpu(v); } /* Assume link up if device can't report link status, otherwise get link status from config. */ netif_carrier_off(dev); if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { virtio_config_changed(vi->vdev); } else { vi->status = VIRTIO_NET_S_LINK_UP; virtnet_update_settings(vi); netif_carrier_on(dev); } for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) { unsigned int fbit; fbit = virtio_offload_to_feature(guest_offloads[i]); if (virtio_has_feature(vi->vdev, fbit)) set_bit(guest_offloads[i], &vi->guest_offloads); } vi->guest_offloads_capable = vi->guest_offloads; rtnl_unlock(); err = virtnet_cpu_notif_add(vi); if (err) { pr_debug("virtio_net: registering cpu notifier failed\n"); goto free_unregister_netdev; } pr_debug("virtnet: registered device %s with %d RX and TX vq's\n", dev->name, max_queue_pairs); return 0; free_unregister_netdev: unregister_netdev(dev); free_failover: net_failover_destroy(vi->failover); free_vqs: virtio_reset_device(vdev); cancel_delayed_work_sync(&vi->refill); free_receive_page_frags(vi); virtnet_del_vqs(vi); free: free_netdev(dev); return err; } static void remove_vq_common(struct virtnet_info *vi) { int i; virtio_reset_device(vi->vdev); /* Free unused buffers in both send and recv, if any. */ free_unused_bufs(vi); /* * Rule of thumb is netdev_tx_reset_queue() should follow any * skb freeing not followed by netdev_tx_completed_queue() */ for (i = 0; i < vi->max_queue_pairs; i++) netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, i)); free_receive_bufs(vi); free_receive_page_frags(vi); virtnet_del_vqs(vi); } static void virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; virtnet_cpu_notif_remove(vi); /* Make sure no work handler is accessing the device. */ flush_work(&vi->config_work); disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); virtnet_free_irq_moder(vi); unregister_netdev(vi->dev); net_failover_destroy(vi->failover); remove_vq_common(vi); free_netdev(vi->dev); } static __maybe_unused int virtnet_freeze(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; virtnet_cpu_notif_remove(vi); virtnet_freeze_down(vdev); remove_vq_common(vi); return 0; } static __maybe_unused int virtnet_restore(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; int err; err = virtnet_restore_up(vdev); if (err) return err; virtnet_set_queues(vi, vi->curr_queue_pairs); err = virtnet_cpu_notif_add(vi); if (err) { virtnet_freeze_down(vdev); remove_vq_common(vi); return err; } return 0; } static struct virtio_device_id id_table[] = { { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, { 0 }, }; #define VIRTNET_FEATURES \ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \ VIRTIO_NET_F_MAC, \ VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \ VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \ VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \ VIRTIO_NET_F_HOST_USO, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, \ VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \ VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ VIRTIO_NET_F_CTRL_MAC_ADDR, \ VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \ VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL, \ VIRTIO_NET_F_VQ_NOTF_COAL, \ VIRTIO_NET_F_GUEST_HDRLEN, VIRTIO_NET_F_DEVICE_STATS static unsigned int features[] = { VIRTNET_FEATURES, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, }; static unsigned int features_legacy[] = { VIRTNET_FEATURES, VIRTIO_NET_F_GSO, VIRTIO_F_ANY_LAYOUT, }; static struct virtio_driver virtio_net_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), .driver.name = KBUILD_MODNAME, .id_table = id_table, .validate = virtnet_validate, .probe = virtnet_probe, .remove = virtnet_remove, .config_changed = virtnet_config_changed, #ifdef CONFIG_PM_SLEEP .freeze = virtnet_freeze, .restore = virtnet_restore, #endif }; static __init int virtio_net_driver_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online", virtnet_cpu_online, virtnet_cpu_down_prep); if (ret < 0) goto out; virtionet_online = ret; ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead", NULL, virtnet_cpu_dead); if (ret) goto err_dead; ret = register_virtio_driver(&virtio_net_driver); if (ret) goto err_virtio; return 0; err_virtio: cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); err_dead: cpuhp_remove_multi_state(virtionet_online); out: return ret; } module_init(virtio_net_driver_init); static __exit void virtio_net_driver_exit(void) { unregister_virtio_driver(&virtio_net_driver); cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD); cpuhp_remove_multi_state(virtionet_online); } module_exit(virtio_net_driver_exit); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio network driver"); MODULE_LICENSE("GPL");
454 161 5 436 5 161 510 511 74 74 14 1 549 76 289 905 575 1 14 6 1 1 272 14 9 303 542 280 72 39 10 1 7 1 4 41 372 331 46 5 3 511 160 435 120 1 13 12 702 239 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include "disasm.h" #define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) static const char * const func_id_str[] = { __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) }; #undef __BPF_FUNC_STR_FN static const char *__func_get_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; if (cbs && cbs->cb_call) { const char *res; res = cbs->cb_call(cbs->private_data, insn); if (res) return res; } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) snprintf(buff, len, "kernel-function"); return buff; } static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, u64 full_imm, char *buff, size_t len) { if (cbs && cbs->cb_imm) return cbs->cb_imm(cbs->private_data, insn, full_imm); snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); return buff; } const char *func_id_name(int id) { if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else return "unknown"; } const char *const bpf_class_string[8] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", [BPF_ST] = "st", [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", [BPF_DIV >> 4] = "/=", [BPF_OR >> 4] = "|=", [BPF_AND >> 4] = "&=", [BPF_LSH >> 4] = "<<=", [BPF_RSH >> 4] = ">>=", [BPF_NEG >> 4] = "neg", [BPF_MOD >> 4] = "%=", [BPF_XOR >> 4] = "^=", [BPF_MOV >> 4] = "=", [BPF_ARSH >> 4] = "s>>=", [BPF_END >> 4] = "endian", }; static const char *const bpf_alu_sign_string[16] = { [BPF_DIV >> 4] = "s/=", [BPF_MOD >> 4] = "s%=", }; static const char *const bpf_movsx_string[4] = { [0] = "(s8)", [1] = "(s16)", [3] = "(s32)", }; static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", [BPF_OR >> 4] = "or", [BPF_XOR >> 4] = "xor", }; static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", [BPF_B >> 3] = "u8", [BPF_DW >> 3] = "u64", }; static const char *const bpf_ldsx_string[] = { [BPF_W >> 3] = "s32", [BPF_H >> 3] = "s16", [BPF_B >> 3] = "s8", }; static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", [BPF_JLT >> 4] = "<", [BPF_JGE >> 4] = ">=", [BPF_JLE >> 4] = "<=", [BPF_JSET >> 4] = "&", [BPF_JNE >> 4] = "!=", [BPF_JSGT >> 4] = "s>", [BPF_JSLT >> 4] = "s<", [BPF_JSGE >> 4] = "s>=", [BPF_JSLE >> 4] = "s<=", [BPF_CALL >> 4] = "call", [BPF_EXIT >> 4] = "exit", }; static void print_bpf_end_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", insn->imm, insn->dst_reg); } static void print_bpf_bswap_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = bswap%d r%d\n", insn->code, insn->dst_reg, insn->imm, insn->dst_reg); } static bool is_sdiv_smod(const struct bpf_insn *insn) { return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) && insn->off == 1; } static bool is_movsx(const struct bpf_insn *insn) { return BPF_OP(insn->code) == BPF_MOV && (insn->off == 8 || insn->off == 16 || insn->off == 32); } static bool is_addr_space_cast(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_SPACE_CAST; } /* Special (internal-only) form of mov, used to resolve per-CPU addrs: * dst_reg = src_reg + <percpu_base_off> * BPF_ADDR_PERCPU is used as a special insn->off value. */ #define BPF_ADDR_PERCPU (-1) static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; } void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) { const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) print_bpf_bswap_insn(verbose, cbs->private_data, insn); else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] : bpf_alu_string[BPF_OP(insn->code) >> 4], is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "", class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { verbose(cbs->private_data, "(%02x) %c%d %s %d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] : bpf_alu_string[BPF_OP(insn->code) >> 4], insn->imm); } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == (BPF_ADD | BPF_FETCH) || insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH))) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_CMPXCHG) { verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", insn->code, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_XCHG) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_LOAD_ACQ) { verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_STORE_REL) { verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) == BPF_MEM) { verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { verbose(cbs->private_data, "(%02x) nospec\n", insn->code); } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); } } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, BPF_MODE(insn->code) == BPF_MEM ? bpf_ldst_string[BPF_SIZE(insn->code) >> 3] : bpf_ldsx_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); } else if (BPF_MODE(insn->code) == BPF_IMM && BPF_SIZE(insn->code) == BPF_DW) { /* At this point, we already made sure that the second * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, imm, tmp, sizeof(tmp))); } else { verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP32 | BPF_JA)) { verbose(cbs->private_data, "(%02x) gotol pc%+d\n", insn->code, insn->imm); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) if %c%d %s %c%d goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { verbose(cbs->private_data, "(%02x) if %c%d %s 0x%x goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } }
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 // SPDX-License-Identifier: GPL-2.0 /* * linux/net/sunrpc/xprtsock.c * * Client-side transport implementation for sockets. * * TCP callback races fixes (C) 1998 Red Hat * TCP send fixes (C) 1998 Red Hat * TCP NFS related read + write fixes * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie> * * Rewrite of larges part of the code in order to stabilize TCP stuff. * Fix behaviour when socket buffer is full. * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> * * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com> * * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005. * <gilles.quillard@bull.net> */ #include <linux/types.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/pagemap.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/un.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprtsock.h> #include <linux/file.h> #ifdef CONFIG_SUNRPC_BACKCHANNEL #include <linux/sunrpc/bc_xprt.h> #endif #include <net/sock.h> #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> #include <net/tls_prot.h> #include <net/handshake.h> #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uio.h> #include <linux/sched/mm.h> #include <trace/events/sock.h> #include <trace/events/sunrpc.h> #include "socklib.h" #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); static void xs_reset_srcport(struct sock_xprt *transport); static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); /* * xprtsock tunables */ static unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; static unsigned int xprt_tcp_slot_table_entries = RPC_MIN_SLOT_TABLE; static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; #define XS_TCP_LINGER_TO (15U * HZ) static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; /* * We can register our own files under /proc/sys/sunrpc by * calling register_sysctl() again. The files in that * directory become the union of all files registered there. * * We simply need to make sure that we don't collide with * someone else's file names! */ static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int max_tcp_slot_table_limit = RPC_MAX_SLOT_TABLE_LIMIT; static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static struct ctl_table_header *sunrpc_table_header; static struct xprt_class xs_local_transport; static struct xprt_class xs_udp_transport; static struct xprt_class xs_tcp_transport; static struct xprt_class xs_tcp_tls_transport; static struct xprt_class xs_bc_tcp_transport; /* * FIXME: changing the UDP slot table size should also resize the UDP * socket buffers for existing UDP transports */ static struct ctl_table xs_tunables_table[] = { { .procname = "udp_slot_table_entries", .data = &xprt_udp_slot_table_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, { .procname = "tcp_slot_table_entries", .data = &xprt_tcp_slot_table_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, { .procname = "tcp_max_slot_table_entries", .data = &xprt_max_tcp_slot_table_entries, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_slot_table_size, .extra2 = &max_tcp_slot_table_limit }, { .procname = "min_resvport", .data = &xprt_min_resvport, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { .procname = "max_resvport", .data = &xprt_max_resvport, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { .procname = "tcp_fin_timeout", .data = &xs_tcp_fin_timeout, .maxlen = sizeof(xs_tcp_fin_timeout), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* * Wait duration for a reply from the RPC portmapper. */ #define XS_BIND_TO (60U * HZ) /* * Delay if a UDP socket connect error occurs. This is most likely some * kind of resource problem on the local host. */ #define XS_UDP_REEST_TO (2U * HZ) /* * The reestablish timeout allows clients to delay for a bit before attempting * to reconnect to a server that just dropped our connection. * * We implement an exponential backoff when trying to reestablish a TCP * transport connection with the server. Some servers like to drop a TCP * connection when they are overworked, so we start with a short timeout and * increase over time if the server is down or not responding. */ #define XS_TCP_INIT_REEST_TO (3U * HZ) /* * TCP idle timeout; client drops the transport socket if it is idle * for this long. Note that we also timeout UDP sockets to prevent * holding port numbers when there is no RPC traffic. */ #define XS_IDLE_DISC_TO (5U * 60 * HZ) /* * TLS handshake timeout. */ #define XS_TLS_HANDSHAKE_TO (10U * HZ) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS #endif #ifdef RPC_DEBUG_DATA static void xs_pktdump(char *msg, u32 *packet, unsigned int count) { u8 *buf = (u8 *) packet; int j; dprintk("RPC: %s\n", msg); for (j = 0; j < count && j < 128; j += 4) { if (!(j & 31)) { if (j) dprintk("\n"); dprintk("0x%04x ", j); } dprintk("%02x%02x%02x%02x ", buf[j], buf[j+1], buf[j+2], buf[j+3]); } dprintk("\n"); } #else static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) { /* NOP */ } #endif static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) { return (struct rpc_xprt *) sk->sk_user_data; } static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) { return (struct sockaddr *) &xprt->addr; } static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt) { return (struct sockaddr_un *) &xprt->addr; } static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt) { return (struct sockaddr_in *) &xprt->addr; } static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt) { return (struct sockaddr_in6 *) &xprt->addr; } static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = xs_addr(xprt); struct sockaddr_in6 *sin6; struct sockaddr_in *sin; struct sockaddr_un *sun; char buf[128]; switch (sap->sa_family) { case AF_LOCAL: sun = xs_addr_un(xprt); if (sun->sun_path[0]) { strscpy(buf, sun->sun_path, sizeof(buf)); } else { buf[0] = '@'; strscpy(buf+1, sun->sun_path+1, sizeof(buf)-1); } xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); break; case AF_INET: (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); sin = xs_addr_in(xprt); snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); break; case AF_INET6: (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); sin6 = xs_addr_in6(xprt); snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); break; default: BUG(); } xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); } static void xs_format_common_peer_ports(struct rpc_xprt *xprt) { struct sockaddr *sap = xs_addr(xprt); char buf[128]; snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); } static void xs_format_peer_addresses(struct rpc_xprt *xprt, const char *protocol, const char *netid) { xprt->address_strings[RPC_DISPLAY_PROTO] = protocol; xprt->address_strings[RPC_DISPLAY_NETID] = netid; xs_format_common_peer_addresses(xprt); xs_format_common_peer_ports(xprt); } static void xs_update_peer_port(struct rpc_xprt *xprt) { kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); kfree(xprt->address_strings[RPC_DISPLAY_PORT]); xs_format_common_peer_ports(xprt); } static void xs_free_peer_addresses(struct rpc_xprt *xprt) { unsigned int i; for (i = 0; i < RPC_DISPLAY_MAX; i++) switch (i) { case RPC_DISPLAY_PROTO: case RPC_DISPLAY_NETID: continue; default: kfree(xprt->address_strings[i]); } } static size_t xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) { size_t i,n; if (!want || !(buf->flags & XDRBUF_SPARSE_PAGES)) return want; n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < n; i++) { if (buf->pages[i]) continue; buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); if (!buf->pages[i]) { i *= PAGE_SIZE; return i > buf->page_base ? i - buf->page_base : 0; } } return want; } static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, unsigned int *msg_flags, struct cmsghdr *cmsg, int ret) { u8 content_type = tls_get_record_type(sock->sk, cmsg); u8 level, description; switch (content_type) { case 0: break; case TLS_RECORD_TYPE_DATA: /* TLS sets EOR at the end of each application data * record, even though there might be more frames * waiting to be decrypted. */ *msg_flags &= ~MSG_EOR; break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(sock->sk, msg, &level, &description); ret = (level == TLS_ALERT_LEVEL_FATAL) ? -EACCES : -EAGAIN; break; default: /* discard this record type */ ret = -EAGAIN; } return ret; } static int xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; u8 alert[2]; struct kvec alert_kvec = { .iov_base = alert, .iov_len = sizeof(alert), }; struct msghdr msg = { .msg_flags = *msg_flags, .msg_control = &u, .msg_controllen = sizeof(u), }; int ret; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); if (ret > 0) { if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } return ret; } static ssize_t xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) { ssize_t ret; if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); ret = sock_recvmsg(sock, msg, flags); /* Handle TLS inband control message lazily */ if (msg->msg_flags & MSG_CTRUNC) { msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); if (ret == 0 || ret == -EIO) ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags); } return ret > 0 ? ret + seek : ret; } static ssize_t xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, struct kvec *kvec, size_t count, size_t seek) { iov_iter_kvec(&msg->msg_iter, ITER_DEST, kvec, 1, count); return xs_sock_recvmsg(sock, msg, flags, seek); } static ssize_t xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, struct bio_vec *bvec, unsigned long nr, size_t count, size_t seek) { iov_iter_bvec(&msg->msg_iter, ITER_DEST, bvec, nr, count); return xs_sock_recvmsg(sock, msg, flags, seek); } static ssize_t xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, ITER_DEST, count); return xs_sock_recvmsg(sock, msg, flags, 0); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static void xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek) { struct bvec_iter bi = { .bi_size = count, }; struct bio_vec bv; bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); for_each_bvec(bv, bvec, bi, bi) flush_dcache_page(bv.bv_page); } #else static inline void xs_flush_bvec(const struct bio_vec *bvec, size_t count, size_t seek) { } #endif static ssize_t xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, struct xdr_buf *buf, size_t count, size_t seek, size_t *read) { size_t want, seek_init = seek, offset = 0; ssize_t ret; want = min_t(size_t, count, buf->head[0].iov_len); if (seek < want) { ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek); if (ret <= 0) goto sock_err; offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) goto out; seek = 0; } else { seek -= want; offset += want; } want = xs_alloc_sparse_pages( buf, min_t(size_t, count - offset, buf->page_len), GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (seek < want) { ret = xs_read_bvec(sock, msg, flags, buf->bvec, xdr_buf_pagecount(buf), want + buf->page_base, seek + buf->page_base); if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); ret -= buf->page_base; offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) goto out; seek = 0; } else { seek -= want; offset += want; } want = min_t(size_t, count - offset, buf->tail[0].iov_len); if (seek < want) { ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); if (ret <= 0) goto sock_err; offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) goto out; } else if (offset < seek_init) offset = seek_init; ret = -EMSGSIZE; out: *read = offset - seek_init; return ret; sock_err: offset += seek; goto out; } static void xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf) { if (!transport->recv.copied) { if (buf->head[0].iov_len >= transport->recv.offset) memcpy(buf->head[0].iov_base, &transport->recv.xid, transport->recv.offset); transport->recv.copied = transport->recv.offset; } } static bool xs_read_stream_request_done(struct sock_xprt *transport) { return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT); } static void xs_read_stream_check_eor(struct sock_xprt *transport, struct msghdr *msg) { if (xs_read_stream_request_done(transport)) msg->msg_flags |= MSG_EOR; } static ssize_t xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, int flags, struct rpc_rqst *req) { struct xdr_buf *buf = &req->rq_private_buf; size_t want, read; ssize_t ret; xs_read_header(transport, buf); want = transport->recv.len - transport->recv.offset; if (want != 0) { ret = xs_read_xdr_buf(transport->sock, msg, flags, buf, transport->recv.copied + want, transport->recv.copied, &read); transport->recv.offset += read; transport->recv.copied += read; } if (transport->recv.offset == transport->recv.len) xs_read_stream_check_eor(transport, msg); if (want == 0) return 0; switch (ret) { default: break; case -EFAULT: case -EMSGSIZE: msg->msg_flags |= MSG_TRUNC; return read; case 0: return -ESHUTDOWN; } return ret < 0 ? ret : read; } static size_t xs_read_stream_headersize(bool isfrag) { if (isfrag) return sizeof(__be32); return 3 * sizeof(__be32); } static ssize_t xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg, int flags, size_t want, size_t seek) { struct kvec kvec = { .iov_base = &transport->recv.fraghdr, .iov_len = want, }; return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) static ssize_t xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) { struct rpc_xprt *xprt = &transport->xprt; struct rpc_rqst *req; ssize_t ret; /* Is this transport associated with the backchannel? */ if (!xprt->bc_serv) return -ESHUTDOWN; /* Look up and lock the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->recv.xid); if (!req) { printk(KERN_WARNING "Callback slot table overflowed\n"); return -ESHUTDOWN; } if (transport->recv.copied && !req->rq_private_buf.len) return -ESHUTDOWN; ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_bc_request(req, transport->recv.copied); else req->rq_private_buf.len = transport->recv.copied; return ret; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ static ssize_t xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) { return -ESHUTDOWN; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ static ssize_t xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) { struct rpc_xprt *xprt = &transport->xprt; struct rpc_rqst *req; ssize_t ret = 0; /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, transport->recv.xid); if (!req || (transport->recv.copied && !req->rq_private_buf.len)) { msg->msg_flags |= MSG_TRUNC; goto out; } xprt_pin_rqst(req); spin_unlock(&xprt->queue_lock); ret = xs_read_stream_request(transport, msg, flags, req); spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_rqst(req->rq_task, transport->recv.copied); else req->rq_private_buf.len = transport->recv.copied; xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); return ret; } static ssize_t xs_read_stream(struct sock_xprt *transport, int flags) { struct msghdr msg = { 0 }; size_t want, read = 0; ssize_t ret = 0; if (transport->recv.len == 0) { want = xs_read_stream_headersize(transport->recv.copied != 0); ret = xs_read_stream_header(transport, &msg, flags, want, transport->recv.offset); if (ret <= 0) goto out_err; transport->recv.offset = ret; if (transport->recv.offset != want) return transport->recv.offset; transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & RPC_FRAGMENT_SIZE_MASK; transport->recv.offset -= sizeof(transport->recv.fraghdr); read = ret; } switch (be32_to_cpu(transport->recv.calldir)) { default: msg.msg_flags |= MSG_TRUNC; break; case RPC_CALL: ret = xs_read_stream_call(transport, &msg, flags); break; case RPC_REPLY: ret = xs_read_stream_reply(transport, &msg, flags); } if (msg.msg_flags & MSG_TRUNC) { transport->recv.calldir = cpu_to_be32(-1); transport->recv.copied = -1; } if (ret < 0) goto out_err; read += ret; if (transport->recv.offset < transport->recv.len) { if (!(msg.msg_flags & MSG_TRUNC)) return read; msg.msg_flags = 0; ret = xs_read_discard(transport->sock, &msg, flags, transport->recv.len - transport->recv.offset); if (ret <= 0) goto out_err; transport->recv.offset += ret; read += ret; if (transport->recv.offset != transport->recv.len) return read; } if (xs_read_stream_request_done(transport)) { trace_xs_stream_read_request(transport); transport->recv.copied = 0; } transport->recv.offset = 0; transport->recv.len = 0; return read; out_err: return ret != 0 ? ret : -ESHUTDOWN; } static __poll_t xs_poll_socket(struct sock_xprt *transport) { return transport->sock->ops->poll(transport->file, transport->sock, NULL); } static bool xs_poll_socket_readable(struct sock_xprt *transport) { __poll_t events = xs_poll_socket(transport); return (events & (EPOLLIN | EPOLLRDNORM)) && !(events & EPOLLRDHUP); } static void xs_poll_check_readable(struct sock_xprt *transport) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); if (test_bit(XPRT_SOCK_IGNORE_RECV, &transport->sock_state)) return; if (!xs_poll_socket_readable(transport)) return; if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); } static void xs_stream_data_receive(struct sock_xprt *transport) { size_t read = 0; ssize_t ret = 0; mutex_lock(&transport->recv_mutex); if (transport->sock == NULL) goto out; for (;;) { ret = xs_read_stream(transport, MSG_DONTWAIT); if (ret < 0) break; read += ret; cond_resched(); } if (ret == -ESHUTDOWN) kernel_sock_shutdown(transport->sock, SHUT_RDWR); else if (ret == -EACCES) xprt_wake_pending_tasks(&transport->xprt, -EACCES); else xs_poll_check_readable(transport); out: mutex_unlock(&transport->recv_mutex); trace_xs_stream_read_data(&transport->xprt, ret, read); } static void xs_stream_data_receive_workfn(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, recv_worker); unsigned int pflags = memalloc_nofs_save(); xs_stream_data_receive(transport); memalloc_nofs_restore(pflags); } static void xs_stream_reset_connect(struct sock_xprt *transport) { transport->recv.offset = 0; transport->recv.len = 0; transport->recv.copied = 0; transport->xmit.offset = 0; } static void xs_stream_start_connect(struct sock_xprt *transport) { transport->xprt.stat.connect_count++; transport->xprt.stat.connect_start = jiffies; } #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) /** * xs_nospace - handle transmit was incomplete * @req: pointer to RPC request * @transport: pointer to struct sock_xprt * */ static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport) { struct rpc_xprt *xprt = &transport->xprt; struct sock *sk = transport->inet; int ret = -EAGAIN; trace_rpc_socket_nospace(req, transport); /* Protect against races with write_space */ spin_lock(&xprt->transport_lock); /* Don't race with disconnect */ if (xprt_connected(xprt)) { /* wait for more buffer space */ set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; spin_unlock(&xprt->transport_lock); return ret; } static int xs_sock_nospace(struct rpc_rqst *req) { struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; lock_sock(sk); if (!sock_writeable(sk)) ret = xs_nospace(req, transport); release_sock(sk); return ret; } static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait) { struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; if (vm_wait) return -ENOBUFS; lock_sock(sk); if (!sk_stream_memory_free(sk)) ret = xs_nospace(req, transport); release_sock(sk); return ret; } static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf) { return xdr_alloc_bvec(buf, rpc_task_gfp_mask()); } static void xs_stream_abort_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (transport->xmit.offset != 0 && !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) xprt_force_disconnect(xprt); } /* * Determine if the previous message in the stream was aborted before it * could complete transmission. */ static bool xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req) { return transport->xmit.offset != 0 && req->rq_bytes_sent == 0; } /* * Return the stream record marker field for a record of length < 2^31-1 */ static rpc_fraghdr xs_stream_record_marker(struct xdr_buf *xdr) { if (!xdr->len) return 0; return cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); } /** * xs_local_send_request - write an RPC request to an AF_LOCAL socket * @req: pointer to RPC request * * Return values: * 0: The request has been sent * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ static int xs_local_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen; struct msghdr msg = { .msg_flags = XS_SENDMSG_FLAGS, }; bool vm_wait; unsigned int sent; int status; /* Close the stream if the previous transmission was incomplete */ if (xs_send_request_was_aborted(transport, req)) { xprt_force_disconnect(xprt); return -ENOTCONN; } xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); vm_wait = sk_stream_is_writeable(transport->inet) ? true : false; req->rq_xtime = ktime_get(); status = xprt_sock_sendmsg(transport->sock, &msg, xdr, transport->xmit.offset, rm, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - transport->xmit.offset, status); if (likely(sent > 0) || status == 0) { transport->xmit.offset += sent; req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= msglen)) { req->rq_xmit_bytes_sent += transport->xmit.offset; transport->xmit.offset = 0; return 0; } status = -EAGAIN; vm_wait = false; } switch (status) { case -EAGAIN: status = xs_stream_nospace(req, vm_wait); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); fallthrough; case -EPIPE: xprt_force_disconnect(xprt); status = -ENOTCONN; } return status; } /** * xs_udp_send_request - write an RPC request to a UDP socket * @req: pointer to RPC request * * Return values: * 0: The request has been sent * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ static int xs_udp_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; struct msghdr msg = { .msg_name = xs_addr(xprt), .msg_namelen = xprt->addrlen, .msg_flags = XS_SENDMSG_FLAGS, }; unsigned int sent; int status; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); if (!xprt_bound(xprt)) return -ENOTCONN; if (!xprt_request_get_cong(xprt, req)) return -EBADSLT; status = xdr_alloc_bvec(xdr, rpc_task_gfp_mask()); if (status < 0) return status; req->rq_xtime = ktime_get(); status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", xdr->len, status); /* firewall is blocking us, don't return -EAGAIN or we end up looping */ if (status == -EPERM) goto process_status; if (status == -EAGAIN && sock_writeable(transport->inet)) status = -ENOBUFS; if (sent > 0 || status == 0) { req->rq_xmit_bytes_sent += sent; if (sent >= req->rq_slen) return 0; /* Still some bytes left; set up for a retry later. */ status = -EAGAIN; } process_status: switch (status) { case -ENOTSOCK: status = -ENOTCONN; /* Should we call xs_close() here? */ break; case -EAGAIN: status = xs_sock_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: case -EPIPE: case -ECONNREFUSED: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); } return status; } /** * xs_tcp_send_request - write an RPC request to a TCP socket * @req: pointer to RPC request * * Return values: * 0: The request has been sent * EAGAIN: The socket was blocked, please call again later to * complete the request * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent * * XXX: In the case of soft timeouts, should we eventually give up * if sendmsg is not able to make progress? */ static int xs_tcp_send_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen; struct msghdr msg = { .msg_flags = XS_SENDMSG_FLAGS, }; bool vm_wait; unsigned int sent; int status; /* Close the stream if the previous transmission was incomplete */ if (xs_send_request_was_aborted(transport, req)) { if (transport->sock != NULL) kernel_sock_shutdown(transport->sock, SHUT_RDWR); return -ENOTCONN; } if (!transport->inet) return -ENOTCONN; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) xs_tcp_set_socket_timeouts(xprt, transport->sock); xs_set_srcport(transport, transport->sock); /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ req->rq_xtime = ktime_get(); tcp_sock_set_cork(transport->inet, true); vm_wait = sk_stream_is_writeable(transport->inet) ? true : false; do { status = xprt_sock_sendmsg(transport->sock, &msg, xdr, transport->xmit.offset, rm, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - transport->xmit.offset, status); /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ transport->xmit.offset += sent; req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= msglen)) { req->rq_xmit_bytes_sent += transport->xmit.offset; transport->xmit.offset = 0; if (atomic_long_read(&xprt->xmit_queuelen) == 1) tcp_sock_set_cork(transport->inet, false); return 0; } WARN_ON_ONCE(sent == 0 && status == 0); if (sent > 0) vm_wait = false; } while (status == 0); switch (status) { case -ENOTSOCK: status = -ENOTCONN; /* Should we call xs_close() here? */ break; case -EAGAIN: status = xs_stream_nospace(req, vm_wait); break; case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); } return status; } static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) { transport->old_data_ready = sk->sk_data_ready; transport->old_state_change = sk->sk_state_change; transport->old_write_space = sk->sk_write_space; transport->old_error_report = sk->sk_error_report; } static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) { sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; sk->sk_write_space = transport->old_write_space; sk->sk_error_report = transport->old_error_report; } static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); transport->xprt_err = 0; clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) { set_bit(nr, &transport->sock_state); queue_work(xprtiod_workqueue, &transport->error_worker); } static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { xprt->connect_cookie++; smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); xs_sock_reset_state_flags(xprt); smp_mb__after_atomic(); } /** * xs_error_report - callback to handle TCP socket state errors * @sk: socket * * Note: we don't call sock_error() since there may be a rpc_task * using the socket, and so we don't want to clear sk->sk_err. */ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; if (!(xprt = xprt_from_sock(sk))) return; transport = container_of(xprt, struct sock_xprt, xprt); transport->xprt_err = -sk->sk_err; if (transport->xprt_err == 0) return; dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -transport->xprt_err); trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); } static void xs_reset_transport(struct sock_xprt *transport) { struct socket *sock = transport->sock; struct sock *sk = transport->inet; struct rpc_xprt *xprt = &transport->xprt; struct file *filp = transport->file; if (sk == NULL) return; /* * Make sure we're calling this in a context from which it is safe * to call __fput_sync(). In practice that means rpciod and the * system workqueue. */ if (!(current->flags & PF_WQ_WORKER)) { WARN_ON_ONCE(1); set_bit(XPRT_CLOSE_WAIT, &xprt->state); return; } if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); tls_handshake_cancel(sk); kernel_sock_shutdown(sock, SHUT_RDWR); mutex_lock(&transport->recv_mutex); lock_sock(sk); transport->inet = NULL; transport->sock = NULL; transport->file = NULL; sk->sk_user_data = NULL; sk->sk_sndtimeo = 0; xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); xs_sock_reset_connection_flags(xprt); /* Reset stream record info */ xs_stream_reset_connect(transport); release_sock(sk); mutex_unlock(&transport->recv_mutex); trace_rpc_socket_close(xprt, sock); __fput_sync(filp); xprt_disconnect_done(xprt); } /** * xs_close - close a socket * @xprt: transport * * This is used when all requests are complete; ie, no DRC state remains * on the server we want to save. * * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with * xs_reset_transport() zeroing the socket from underneath a writer. */ static void xs_close(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); dprintk("RPC: xs_close xprt %p\n", xprt); if (transport->sock) tls_handshake_close(transport->sock); xs_reset_transport(transport); xprt->reestablish_timeout = 0; } static void xs_inject_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: injecting transport disconnect on xprt=%p\n", xprt); xprt_disconnect_done(xprt); } static void xs_xprt_free(struct rpc_xprt *xprt) { xs_free_peer_addresses(xprt); xprt_free(xprt); } /** * xs_destroy - prepare to shutdown a transport * @xprt: doomed transport * */ static void xs_destroy(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); dprintk("RPC: xs_destroy xprt %p\n", xprt); cancel_delayed_work_sync(&transport->connect_worker); xs_close(xprt); cancel_work_sync(&transport->recv_worker); cancel_work_sync(&transport->error_worker); xs_xprt_free(xprt); module_put(THIS_MODULE); } /** * xs_udp_data_read_skb - receive callback for UDP sockets * @xprt: transport * @sk: socket * @skb: skbuff * */ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, struct sock *sk, struct sk_buff *skb) { struct rpc_task *task; struct rpc_rqst *rovr; int repsize, copied; u32 _xid; __be32 *xp; repsize = skb->len; if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); return; } /* Copy the XID from the skb... */ xp = skb_header_pointer(skb, 0, sizeof(_xid), &_xid); if (xp == NULL) return; /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); xprt_update_rtt(rovr->rq_task); spin_unlock(&xprt->queue_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) copied = repsize; /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { spin_lock(&xprt->queue_lock); __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unpin; } spin_lock(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); spin_unlock(&xprt->transport_lock); spin_lock(&xprt->queue_lock); xprt_complete_rqst(task, copied); __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); out_unpin: xprt_unpin_rqst(rovr); out_unlock: spin_unlock(&xprt->queue_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) { struct sk_buff *skb; struct sock *sk; int err; mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; for (;;) { skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (skb == NULL) break; xs_udp_data_read_skb(&transport->xprt, sk, skb); consume_skb(skb); cond_resched(); } xs_poll_check_readable(transport); out: mutex_unlock(&transport->recv_mutex); } static void xs_udp_data_receive_workfn(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, recv_worker); unsigned int pflags = memalloc_nofs_save(); xs_udp_data_receive(transport); memalloc_nofs_restore(pflags); } /** * xs_data_ready - "data ready" callback for sockets * @sk: socket with data to read * */ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; trace_sk_data_ready(sk); xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); trace_xs_data_ready(xprt); transport->old_data_ready(sk); if (test_bit(XPRT_SOCK_IGNORE_RECV, &transport->sock_state)) return; /* Any data means we had a useful conversation, so * then we don't need to delay the next reconnect */ if (xprt->reestablish_timeout) xprt->reestablish_timeout = 0; if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) queue_work(xprtiod_workqueue, &transport->recv_worker); } } /* * Helper function to force a TCP close if the server is sending * junk and/or it has put us in CLOSE_WAIT */ static void xs_tcp_force_close(struct rpc_xprt *xprt) { xprt_force_disconnect(xprt); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** * xs_local_state_change - callback to handle AF_LOCAL socket state changes * @sk: socket whose state has changed * */ static void xs_local_state_change(struct sock *sk) { struct rpc_xprt *xprt; struct sock_xprt *transport; if (!(xprt = xprt_from_sock(sk))) return; transport = container_of(xprt, struct sock_xprt, xprt); if (sk->sk_shutdown & SHUTDOWN_MASK) { clear_bit(XPRT_CONNECTED, &xprt->state); /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } } /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed * */ static void xs_tcp_state_change(struct sock *sk) { struct rpc_xprt *xprt; struct sock_xprt *transport; if (!(xprt = xprt_from_sock(sk))) return; dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n", sk->sk_state, xprt_connected(xprt), sock_flag(sk, SOCK_DEAD), sock_flag(sk, SOCK_ZAPPED), sk->sk_shutdown); transport = container_of(xprt, struct sock_xprt, xprt); trace_rpc_socket_state_change(xprt, sk->sk_socket); switch (sk->sk_state) { case TCP_ESTABLISHED: if (!xprt_test_and_set_connected(xprt)) { xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING); } break; case TCP_FIN_WAIT1: /* The client initiated a shutdown of the socket */ xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_atomic(); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); fallthrough; case TCP_CLOSING: /* * If the server closed down the connection, make sure that * we back off before reconnecting */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_atomic(); break; case TCP_CLOSE: if (test_and_clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state)) { xs_reset_srcport(transport); xprt_clear_connecting(xprt); } clear_bit(XPRT_CLOSING, &xprt->state); /* Trigger the socket release */ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } } static void xs_write_space(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; if (!sk->sk_socket) return; clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; transport = container_of(xprt, struct sock_xprt, xprt); if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state)) return; xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE); sk->sk_write_pending--; } /** * xs_udp_write_space - callback invoked when socket buffer space * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. */ static void xs_udp_write_space(struct sock *sk) { /* from net/core/sock.c:sock_def_write_space */ if (sock_writeable(sk)) xs_write_space(sk); } /** * xs_tcp_write_space - callback invoked when socket buffer space * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. */ static void xs_tcp_write_space(struct sock *sk) { /* from net/core/stream.c:sk_stream_write_space */ if (sk_stream_is_writeable(sk)) xs_write_space(sk); } static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; if (transport->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = transport->rcvsize * xprt->max_reqs * 2; } if (transport->sndsize) { sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = transport->sndsize * xprt->max_reqs * 2; sk->sk_write_space(sk); } } /** * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport * @sndsize: requested size of send buffer, in bytes * @rcvsize: requested size of receive buffer, in bytes * * Set socket send and receive buffer size limits. */ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); transport->sndsize = 0; if (sndsize) transport->sndsize = sndsize + 1024; transport->rcvsize = 0; if (rcvsize) transport->rcvsize = rcvsize + 1024; xs_udp_do_set_buffer_size(xprt); } /** * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport * @xprt: controlling transport * @task: task that timed out * * Adjust the congestion window after a retransmit timeout has occurred. */ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) { spin_lock(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, -ETIMEDOUT); spin_unlock(&xprt->transport_lock); } static int xs_get_random_port(void) { unsigned short min = xprt_min_resvport, max = xprt_max_resvport; unsigned short range; unsigned short rand; if (max < min) return -EADDRINUSE; range = max - min + 1; rand = get_random_u32_below(range); return rand + min; } static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; unsigned short port = 0; if (kernel_getsockname(sock, (struct sockaddr *)&buf) < 0) goto out; switch (buf.ss_family) { case AF_INET6: port = ntohs(((struct sockaddr_in6 *)&buf)->sin6_port); break; case AF_INET: port = ntohs(((struct sockaddr_in *)&buf)->sin_port); } out: return port; } /** * xs_set_port - reset the port number in the remote endpoint address * @xprt: generic transport * @port: new port number * */ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) { dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); rpc_set_port(xs_addr(xprt), port); xs_update_peer_port(xprt); } static void xs_reset_srcport(struct sock_xprt *transport) { transport->srcport = 0; } static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) { if (transport->srcport == 0 && transport->xprt.reuseport) transport->srcport = xs_sock_getport(sock); } static int xs_get_srcport(struct sock_xprt *transport) { int port = transport->srcport; if (port == 0 && transport->xprt.resvport) port = xs_get_random_port(); return port; } static unsigned short xs_sock_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); unsigned short ret = 0; mutex_lock(&sock->recv_mutex); if (sock->sock) ret = xs_sock_getport(sock->sock); mutex_unlock(&sock->recv_mutex); return ret; } static int xs_sock_srcaddr(struct rpc_xprt *xprt, char *buf, size_t buflen) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); union { struct sockaddr sa; struct sockaddr_storage st; } saddr; int ret = -ENOTCONN; mutex_lock(&sock->recv_mutex); if (sock->sock) { ret = kernel_getsockname(sock->sock, &saddr.sa); if (ret >= 0) ret = snprintf(buf, buflen, "%pISc", &saddr.sa); } mutex_unlock(&sock->recv_mutex); return ret; } static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { if (transport->srcport != 0) transport->srcport = 0; if (!transport->xprt.resvport) return 0; if (port <= xprt_min_resvport || port > xprt_max_resvport) return xprt_max_resvport; return --port; } static int xs_bind(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_storage myaddr; int err, nloop = 0; int port = xs_get_srcport(transport); unsigned short last; /* * If we are asking for any ephemeral port (i.e. port == 0 && * transport->xprt.resvport == 0), don't bind. Let the local * port selection happen implicitly when the socket is used * (for example at connect time). * * This ensures that we can continue to establish TCP * connections even when all local ephemeral ports are already * a part of some TCP connection. This makes no difference * for UDP sockets, but also doesn't harm them. * * If we're asking for any reserved port (i.e. port == 0 && * transport->xprt.resvport == 1) xs_get_srcport above will * ensure that port is non-zero and we will bind as needed. */ if (port <= 0) return port; memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; break; } last = port; port = xs_next_srcport(transport, port); if (port > last) nloop++; } while (err == -EADDRINUSE && nloop != 2); if (myaddr.ss_family == AF_INET) dprintk("RPC: %s %pI4:%u: %s (%d)\n", __func__, &((struct sockaddr_in *)&myaddr)->sin_addr, port, err ? "failed" : "ok", err); else dprintk("RPC: %s %pI6:%u: %s (%d)\n", __func__, &((struct sockaddr_in6 *)&myaddr)->sin6_addr, port, err ? "failed" : "ok", err); return err; } /* * We don't support autobind on AF_LOCAL sockets */ static void xs_local_rpcbind(struct rpc_task *task) { xprt_set_bound(task->tk_xprt); } static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) { } #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key xs_key[3]; static struct lock_class_key xs_slock_key[3]; static inline void xs_reclassify_socketu(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", &xs_slock_key[0], "sk_lock-AF_LOCAL-RPC", &xs_key[0]); } static inline void xs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", &xs_slock_key[1], "sk_lock-AF_INET-RPC", &xs_key[1]); } static inline void xs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", &xs_slock_key[2], "sk_lock-AF_INET6-RPC", &xs_key[2]); } static inline void xs_reclassify_socket(int family, struct socket *sock) { if (WARN_ON_ONCE(!sock_allow_reclassification(sock->sk))) return; switch (family) { case AF_LOCAL: xs_reclassify_socketu(sock); break; case AF_INET: xs_reclassify_socket4(sock); break; case AF_INET6: xs_reclassify_socket6(sock); break; } } #else static inline void xs_reclassify_socket(int family, struct socket *sock) { } #endif static void xs_dummy_setup_socket(struct work_struct *work) { } static struct socket *xs_create_sock(struct rpc_xprt *xprt, struct sock_xprt *transport, int family, int type, int protocol, bool reuseport) { struct file *filp; struct socket *sock; int err; err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1); if (err < 0) { dprintk("RPC: can't create %d transport socket (%d).\n", protocol, -err); goto out; } xs_reclassify_socket(family, sock); if (reuseport) sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { sock_release(sock); goto out; } if (protocol == IPPROTO_TCP) sk_net_refcnt_upgrade(sock->sk); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) return ERR_CAST(filp); transport->file = filp; return sock; out: return ERR_PTR(err); } static int xs_local_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (!transport->inet) { struct sock *sk = sock->sk; lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; sk->sk_use_task_frag = false; xprt_clear_connected(xprt); /* Reset to new socket */ transport->sock = sock; transport->inet = sk; release_sock(sk); } xs_stream_start_connect(transport); return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } /** * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint * @transport: socket transport to connect */ static int xs_local_setup_socket(struct sock_xprt *transport) { struct rpc_xprt *xprt = &transport->xprt; struct file *filp; struct socket *sock; int status; status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); if (status < 0) { dprintk("RPC: can't create AF_LOCAL " "transport socket (%d).\n", -status); goto out; } xs_reclassify_socket(AF_LOCAL, sock); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) { status = PTR_ERR(filp); goto out; } transport->file = filp; dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); status = xs_local_finish_connecting(xprt, sock); trace_rpc_socket_connect(xprt, sock, status); switch (status) { case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; xprt_set_connected(xprt); break; case -ENOBUFS: break; case -ENOENT: dprintk("RPC: xprt %p: socket %s does not exist\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); break; case -ECONNREFUSED: dprintk("RPC: xprt %p: connection refused for %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); break; default: printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n", __func__, -status, xprt->address_strings[RPC_DISPLAY_ADDR]); } out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); return status; } static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; if (transport->file) goto force_disconnect; if (RPC_IS_ASYNC(task)) { /* * We want the AF_LOCAL connect to be resolved in the * filesystem namespace of the process making the rpc * call. Thus we connect synchronously. * * If we want to support asynchronous AF_LOCAL calls, * we'll need to figure out how to pass a namespace to * connect. */ rpc_task_set_rpc_status(task, -ENOTCONN); goto out_wake; } ret = xs_local_setup_socket(transport); if (ret && !RPC_IS_SOFTCONN(task)) msleep_interruptible(15000); return; force_disconnect: xprt_force_disconnect(xprt); out_wake: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); } #if IS_ENABLED(CONFIG_SUNRPC_SWAP) /* * Note that this should be called with XPRT_LOCKED held, or recv_mutex * held, or when we otherwise know that we have exclusive access to the * socket, to guard against races with xs_reset_transport. */ static void xs_set_memalloc(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); /* * If there's no sock, then we have nothing to set. The * reconnecting process will get it for us. */ if (!transport->inet) return; if (atomic_read(&xprt->swapper)) sk_set_memalloc(transport->inet); } /** * xs_enable_swap - Tag this transport as being used for swap. * @xprt: transport to tag * * Take a reference to this transport on behalf of the rpc_clnt, and * optionally mark it for swapping if it wasn't already. */ static int xs_enable_swap(struct rpc_xprt *xprt) { struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); mutex_lock(&xs->recv_mutex); if (atomic_inc_return(&xprt->swapper) == 1 && xs->inet) sk_set_memalloc(xs->inet); mutex_unlock(&xs->recv_mutex); return 0; } /** * xs_disable_swap - Untag this transport as being used for swap. * @xprt: transport to tag * * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the * swapper refcount goes to 0, untag the socket as a memalloc socket. */ static void xs_disable_swap(struct rpc_xprt *xprt) { struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt); mutex_lock(&xs->recv_mutex); if (atomic_dec_and_test(&xprt->swapper) && xs->inet) sk_clear_memalloc(xs->inet); mutex_unlock(&xs->recv_mutex); } #else static void xs_set_memalloc(struct rpc_xprt *xprt) { } static int xs_enable_swap(struct rpc_xprt *xprt) { return -EINVAL; } static void xs_disable_swap(struct rpc_xprt *xprt) { } #endif static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (!transport->inet) { struct sock *sk = sock->sk; lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_use_task_frag = false; xprt_set_connected(xprt); /* Reset to new socket */ transport->sock = sock; transport->inet = sk; xs_set_memalloc(xprt); release_sock(sk); } xs_udp_do_set_buffer_size(xprt); xprt->stat.connect_start = jiffies; } static void xs_udp_setup_socket(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; struct socket *sock; int status = -EIO; unsigned int pflags = current->flags; if (atomic_read(&xprt->swapper)) current->flags |= PF_MEMALLOC; sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP, false); if (IS_ERR(sock)) goto out; dprintk("RPC: worker connecting xprt %p via %s to " "%s (port %s)\n", xprt, xprt->address_strings[RPC_DISPLAY_PROTO], xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); xs_udp_finish_connecting(xprt, sock); trace_rpc_socket_connect(xprt, sock, 0); status = 0; out: xprt_clear_connecting(xprt); xprt_unlock_connect(xprt, transport); xprt_wake_pending_tasks(xprt, status); current_restore_flags(pflags, PF_MEMALLOC); } /** * xs_tcp_shutdown - gracefully shut down a TCP socket * @xprt: transport * * Initiates a graceful shutdown of the TCP socket by calling the * equivalent of shutdown(SHUT_RDWR); */ static void xs_tcp_shutdown(struct rpc_xprt *xprt) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct socket *sock = transport->sock; int skst = transport->inet ? transport->inet->sk_state : TCP_CLOSE; if (sock == NULL) return; if (!xprt->reuseport) { xs_close(xprt); return; } switch (skst) { case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: case TCP_LAST_ACK: break; case TCP_ESTABLISHED: case TCP_CLOSE_WAIT: kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); break; default: xs_reset_transport(transport); } } static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct net *net = sock_net(sock->sk); unsigned long connect_timeout; unsigned long syn_retries; unsigned int keepidle; unsigned int keepcnt; unsigned int timeo; unsigned long t; spin_lock(&xprt->transport_lock); keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ); keepcnt = xprt->timeout->to_retries + 1; timeo = jiffies_to_msecs(xprt->timeout->to_initval) * (xprt->timeout->to_retries + 1); clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ sock_set_keepalive(sock->sk); tcp_sock_set_keepidle(sock->sk, keepidle); tcp_sock_set_keepintvl(sock->sk, keepidle); tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ tcp_sock_set_user_timeout(sock->sk, timeo); /* Connect timeout */ connect_timeout = max_t(unsigned long, DIV_ROUND_UP(xprt->connect_timeout, HZ), 1); syn_retries = max_t(unsigned long, READ_ONCE(net->ipv4.sysctl_tcp_syn_retries), 1); for (t = 0; t <= syn_retries && (1UL << t) < connect_timeout; t++) ; if (t <= syn_retries) tcp_sock_set_syncnt(sock->sk, t - 1); } static void xs_tcp_do_set_connect_timeout(struct rpc_xprt *xprt, unsigned long connect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct rpc_timeout to; unsigned long initval; memcpy(&to, xprt->timeout, sizeof(to)); /* Arbitrary lower limit */ initval = max_t(unsigned long, connect_timeout, XS_TCP_INIT_REEST_TO); to.to_initval = initval; to.to_maxval = initval; to.to_retries = 0; memcpy(&transport->tcp_timeout, &to, sizeof(transport->tcp_timeout)); xprt->timeout = &transport->tcp_timeout; xprt->connect_timeout = connect_timeout; } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, unsigned long connect_timeout, unsigned long reconnect_timeout) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); spin_lock(&xprt->transport_lock); if (reconnect_timeout < xprt->max_reconnect_timeout) xprt->max_reconnect_timeout = reconnect_timeout; if (connect_timeout < xprt->connect_timeout) xs_tcp_do_set_connect_timeout(xprt, connect_timeout); set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); spin_unlock(&xprt->transport_lock); } static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); if (!transport->inet) { struct sock *sk = sock->sk; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. * RFC4941, section 3.6 suggests that: * Individual applications, which have specific * knowledge about the normal duration of connections, * MAY override this as appropriate. */ if (xs_addr(xprt)->sa_family == PF_INET6) { ip6_sock_set_addr_preferences(sk, IPV6_PREFER_SRC_PUBLIC); } xs_tcp_set_socket_timeouts(xprt, sock); tcp_sock_set_nodelay(sk); lock_sock(sk); xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; sk->sk_use_task_frag = false; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); xprt_clear_connected(xprt); /* Reset to new socket */ transport->sock = sock; transport->inet = sk; release_sock(sk); } if (!xprt_bound(xprt)) return -ENOTCONN; xs_set_memalloc(xprt); xs_stream_start_connect(transport); /* Tell the socket layer to start connecting... */ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); } /** * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint * @work: queued work item * * Invoked by a work queue tasklet. */ static void xs_tcp_setup_socket(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct socket *sock = transport->sock; struct rpc_xprt *xprt = &transport->xprt; int status; unsigned int pflags = current->flags; if (atomic_read(&xprt->swapper)) current->flags |= PF_MEMALLOC; if (xprt_connected(xprt)) goto out; if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state) || !sock) { xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP, true); if (IS_ERR(sock)) { xprt_wake_pending_tasks(xprt, PTR_ERR(sock)); goto out; } } dprintk("RPC: worker connecting xprt %p via %s to " "%s (port %s)\n", xprt, xprt->address_strings[RPC_DISPLAY_PROTO], xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT]); status = xs_tcp_finish_connecting(xprt, sock); trace_rpc_socket_connect(xprt, sock, status); dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); switch (status) { case 0: case -EINPROGRESS: /* SYN_SENT! */ set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state); if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; fallthrough; case -EALREADY: goto out_unlock; case -EADDRNOTAVAIL: /* Source port number is unavailable. Try a new one! */ transport->srcport = 0; status = -EAGAIN; break; case -EPERM: /* Happens, for instance, if a BPF program is preventing * the connect. Remap the error so upper layers can better * deal with it. */ status = -ECONNREFUSED; fallthrough; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. */ case -ECONNREFUSED: case -ECONNRESET: case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: case -ENOTCONN: break; default: printk("%s: connect returned unhandled error %d\n", __func__, status); status = -EAGAIN; } /* xs_tcp_force_close() wakes tasks with a fixed error code. * We need to wake them first to ensure the correct error code. */ xprt_wake_pending_tasks(xprt, status); xs_tcp_force_close(xprt); out: xprt_clear_connecting(xprt); out_unlock: xprt_unlock_connect(xprt, transport); current_restore_flags(pflags, PF_MEMALLOC); } /* * Transfer the connected socket to @upper_transport, then mark that * xprt CONNECTED. */ static int xs_tcp_tls_finish_connecting(struct rpc_xprt *lower_xprt, struct sock_xprt *upper_transport) { struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); struct rpc_xprt *upper_xprt = &upper_transport->xprt; if (!upper_transport->inet) { struct socket *sock = lower_transport->sock; struct sock *sk = sock->sk; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. * RFC4941, section 3.6 suggests that: * Individual applications, which have specific * knowledge about the normal duration of connections, * MAY override this as appropriate. */ if (xs_addr(upper_xprt)->sa_family == PF_INET6) ip6_sock_set_addr_preferences(sk, IPV6_PREFER_SRC_PUBLIC); xs_tcp_set_socket_timeouts(upper_xprt, sock); tcp_sock_set_nodelay(sk); lock_sock(sk); /* @sk is already connected, so it now has the RPC callbacks. * Reach into @lower_transport to save the original ones. */ upper_transport->old_data_ready = lower_transport->old_data_ready; upper_transport->old_state_change = lower_transport->old_state_change; upper_transport->old_write_space = lower_transport->old_write_space; upper_transport->old_error_report = lower_transport->old_error_report; sk->sk_user_data = upper_xprt; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); xprt_clear_connected(upper_xprt); upper_transport->sock = sock; upper_transport->inet = sk; upper_transport->file = lower_transport->file; release_sock(sk); /* Reset lower_transport before shutting down its clnt */ mutex_lock(&lower_transport->recv_mutex); lower_transport->inet = NULL; lower_transport->sock = NULL; lower_transport->file = NULL; xprt_clear_connected(lower_xprt); xs_sock_reset_connection_flags(lower_xprt); xs_stream_reset_connect(lower_transport); mutex_unlock(&lower_transport->recv_mutex); } if (!xprt_bound(upper_xprt)) return -ENOTCONN; xs_set_memalloc(upper_xprt); if (!xprt_test_and_set_connected(upper_xprt)) { upper_xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); xprt_clear_connecting(upper_xprt); upper_xprt->stat.connect_count++; upper_xprt->stat.connect_time += (long)jiffies - upper_xprt->stat.connect_start; xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); } return 0; } /** * xs_tls_handshake_done - TLS handshake completion handler * @data: address of xprt to wake * @status: status of handshake * @peerid: serial number of key containing the remote's identity * */ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) { struct rpc_xprt *lower_xprt = data; struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); switch (status) { case 0: case -EACCES: case -ETIMEDOUT: lower_transport->xprt_err = status; break; default: lower_transport->xprt_err = -EACCES; } complete(&lower_transport->handshake_done); xprt_put(lower_xprt); } static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_parms *xprtsec) { struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); struct tls_handshake_args args = { .ta_sock = lower_transport->sock, .ta_done = xs_tls_handshake_done, .ta_data = xprt_get(lower_xprt), .ta_peername = lower_xprt->servername, }; struct sock *sk = lower_transport->inet; int rc; init_completion(&lower_transport->handshake_done); set_bit(XPRT_SOCK_IGNORE_RECV, &lower_transport->sock_state); lower_transport->xprt_err = -ETIMEDOUT; switch (xprtsec->policy) { case RPC_XPRTSEC_TLS_ANON: rc = tls_client_hello_anon(&args, GFP_KERNEL); if (rc) goto out_put_xprt; break; case RPC_XPRTSEC_TLS_X509: args.ta_my_cert = xprtsec->cert_serial; args.ta_my_privkey = xprtsec->privkey_serial; rc = tls_client_hello_x509(&args, GFP_KERNEL); if (rc) goto out_put_xprt; break; default: rc = -EACCES; goto out_put_xprt; } rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done, XS_TLS_HANDSHAKE_TO); if (rc <= 0) { tls_handshake_cancel(sk); if (rc == 0) rc = -ETIMEDOUT; goto out_put_xprt; } rc = lower_transport->xprt_err; out: xs_stream_reset_connect(lower_transport); clear_bit(XPRT_SOCK_IGNORE_RECV, &lower_transport->sock_state); return rc; out_put_xprt: xprt_put(lower_xprt); goto out; } /** * xs_tcp_tls_setup_socket - establish a TLS session on a TCP socket * @work: queued work item * * Invoked by a work queue tasklet. * * For RPC-with-TLS, there is a two-stage connection process. * * The "upper-layer xprt" is visible to the RPC consumer. Once it has * been marked connected, the consumer knows that a TCP connection and * a TLS session have been established. * * A "lower-layer xprt", created in this function, handles the mechanics * of connecting the TCP socket, performing the RPC_AUTH_TLS probe, and * then driving the TLS handshake. Once all that is complete, the upper * layer xprt is marked connected. */ static void xs_tcp_tls_setup_socket(struct work_struct *work) { struct sock_xprt *upper_transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_clnt *upper_clnt = upper_transport->clnt; struct rpc_xprt *upper_xprt = &upper_transport->xprt; struct rpc_create_args args = { .net = upper_xprt->xprt_net, .protocol = upper_xprt->prot, .address = (struct sockaddr *)&upper_xprt->addr, .addrsize = upper_xprt->addrlen, .timeout = upper_clnt->cl_timeout, .servername = upper_xprt->servername, .program = upper_clnt->cl_program, .prognumber = upper_clnt->cl_prog, .version = upper_clnt->cl_vers, .authflavor = RPC_AUTH_TLS, .cred = upper_clnt->cl_cred, .xprtsec = { .policy = RPC_XPRTSEC_NONE, }, .stats = upper_clnt->cl_stats, }; unsigned int pflags = current->flags; struct rpc_clnt *lower_clnt; struct rpc_xprt *lower_xprt; int status; if (atomic_read(&upper_xprt->swapper)) current->flags |= PF_MEMALLOC; xs_stream_start_connect(upper_transport); /* This implicitly sends an RPC_AUTH_TLS probe */ lower_clnt = rpc_create(&args); if (IS_ERR(lower_clnt)) { trace_rpc_tls_unavailable(upper_clnt, upper_xprt); clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); xprt_clear_connecting(upper_xprt); xprt_wake_pending_tasks(upper_xprt, PTR_ERR(lower_clnt)); xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); goto out_unlock; } /* RPC_AUTH_TLS probe was successful. Try a TLS handshake on * the lower xprt. */ rcu_read_lock(); lower_xprt = rcu_dereference(lower_clnt->cl_xprt); rcu_read_unlock(); if (wait_on_bit_lock(&lower_xprt->state, XPRT_LOCKED, TASK_KILLABLE)) goto out_unlock; status = xs_tls_handshake_sync(lower_xprt, &upper_xprt->xprtsec); if (status) { trace_rpc_tls_not_started(upper_clnt, upper_xprt); goto out_close; } status = xs_tcp_tls_finish_connecting(lower_xprt, upper_transport); if (status) goto out_close; xprt_release_write(lower_xprt, NULL); trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); rpc_shutdown_client(lower_clnt); /* Check for ingress data that arrived before the socket's * ->data_ready callback was set up. */ xs_poll_check_readable(upper_transport); out_unlock: current_restore_flags(pflags, PF_MEMALLOC); upper_transport->clnt = NULL; xprt_unlock_connect(upper_xprt, upper_transport); return; out_close: xprt_release_write(lower_xprt, NULL); rpc_shutdown_client(lower_clnt); /* xprt_force_disconnect() wakes tasks with a fixed tk_status code. * Wake them first here to ensure they get our tk_status code. */ xprt_wake_pending_tasks(upper_xprt, status); xs_tcp_force_close(upper_xprt); xprt_clear_connecting(upper_xprt); goto out_unlock; } /** * xs_connect - connect a socket to a remote endpoint * @xprt: pointer to transport structure * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. * * UDP socket connects are synchronous, but we use a work queue anyway * to guarantee that even unprivileged user processes can set up a * socket on a privileged port. * * If a UDP socket connect fails, the delay behavior here prevents * retry floods (hard mounts). */ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned long delay = 0; WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO); } else dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); transport->clnt = task->tk_client; queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, delay); } static void xs_wake_disconnect(struct sock_xprt *transport) { if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state)) xs_tcp_force_close(&transport->xprt); } static void xs_wake_write(struct sock_xprt *transport) { if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state)) xprt_write_space(&transport->xprt); } static void xs_wake_error(struct sock_xprt *transport) { int sockerr; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; sockerr = xchg(&transport->xprt_err, 0); if (sockerr < 0) { xprt_wake_pending_tasks(&transport->xprt, sockerr); xs_tcp_force_close(&transport->xprt); } } static void xs_wake_pending(struct sock_xprt *transport) { if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state)) xprt_wake_pending_tasks(&transport->xprt, -EAGAIN); } static void xs_error_handle(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, error_worker); xs_wake_disconnect(transport); xs_wake_write(transport); xs_wake_error(transport); xs_wake_pending(transport); } /** * xs_local_print_stats - display AF_LOCAL socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * */ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { long idle_time = 0; if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu " "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u, xprt->stat.max_slots, xprt->stat.sending_u, xprt->stat.pending_u); } /** * xs_udp_print_stats - display UDP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * */ static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %llu %llu " "%lu %llu %llu\n", transport->srcport, xprt->stat.bind_count, xprt->stat.sends, xprt->stat.recvs, xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u, xprt->stat.max_slots, xprt->stat.sending_u, xprt->stat.pending_u); } /** * xs_tcp_print_stats - display TCP socket-specific stats * @xprt: rpc_xprt struct containing statistics * @seq: output file * */ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); long idle_time = 0; if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu " "%llu %llu %lu %llu %llu\n", transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u, xprt->stat.max_slots, xprt->stat.sending_u, xprt->stat.pending_u); } /* * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ static int bc_malloc(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", size); return -EINVAL; } page = alloc_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (!page) return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; rqst->rq_buffer = buf->data; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } /* * Free the space allocated in the bc_alloc routine */ static void bc_free(struct rpc_task *task) { void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } static int bc_sendto(struct rpc_rqst *req) { struct xdr_buf *xdr = &req->rq_snd_buf; struct sock_xprt *transport = container_of(req->rq_xprt, struct sock_xprt, xprt); struct msghdr msg = { .msg_flags = 0, }; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); unsigned int sent = 0; int err; req->rq_xtime = ktime_get(); err = xdr_alloc_bvec(xdr, rpc_task_gfp_mask()); if (err < 0) return err; err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent); xdr_free_bvec(xdr); if (err < 0 || sent != (xdr->len + sizeof(marker))) return -EAGAIN; return sent; } /** * bc_send_request - Send a backchannel Call on a TCP socket * @req: rpc_rqst containing Call message to be sent * * xpt_mutex ensures @rqstp's whole message is written to the socket * without interruption. * * Return values: * %0 if the message was sent successfully * %ENOTCONN if the message was not sent */ static int bc_send_request(struct rpc_rqst *req) { struct svc_xprt *xprt; int len; /* * Get the server socket associated with this callback xprt */ xprt = req->rq_xprt->bc_xprt; /* * Grab the mutex to serialize data as the connection is shared * with the fore channel */ mutex_lock(&xprt->xpt_mutex); if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else len = bc_sendto(req); mutex_unlock(&xprt->xpt_mutex); if (len > 0) len = 0; return len; } static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt); xs_xprt_free(xprt); module_put(THIS_MODULE); } static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_close, .destroy = xs_destroy, .print_stats = xs_local_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, }; static const struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .get_srcaddr = xs_sock_srcaddr, .get_srcport = xs_sock_srcport, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .send_request = xs_udp_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_rtt, .timer = xs_udp_timer, .release_request = xprt_release_rqst_cong, .close = xs_close, .destroy = xs_destroy, .print_stats = xs_udp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, }; static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .get_srcaddr = xs_sock_srcaddr, .get_srcport = xs_sock_srcport, .buf_alloc = rpc_malloc, .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, .abort_send_request = xs_stream_abort_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, .set_connect_timeout = xs_tcp_set_connect_timeout, .print_stats = xs_tcp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_num_slots = xprt_bc_max_slots, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, #endif }; /* * The rpc_xprt_ops for the server backchannel */ static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .buf_alloc = bc_malloc, .buf_free = bc_free, .send_request = bc_send_request, .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = bc_close, .destroy = bc_destroy, .print_stats = xs_tcp_print_stats, .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, }; static int xs_init_anyaddr(const int family, struct sockaddr *sap) { static const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; static const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, }; switch (family) { case AF_LOCAL: break; case AF_INET: memcpy(sap, &sin, sizeof(sin)); break; case AF_INET6: memcpy(sap, &sin6, sizeof(sin6)); break; default: dprintk("RPC: %s: Bad address family\n", __func__); return -EAFNOSUPPORT; } return 0; } static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, unsigned int slot_table_size, unsigned int max_slot_table_size) { struct rpc_xprt *xprt; struct sock_xprt *new; if (args->addrlen > sizeof(xprt->addr)) { dprintk("RPC: xs_setup_xprt: address too large\n"); return ERR_PTR(-EBADF); } xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size, max_slot_table_size); if (xprt == NULL) { dprintk("RPC: xs_setup_xprt: couldn't allocate " "rpc_xprt\n"); return ERR_PTR(-ENOMEM); } new = container_of(xprt, struct sock_xprt, xprt); mutex_init(&new->recv_mutex); memcpy(&xprt->addr, args->dstaddr, args->addrlen); xprt->addrlen = args->addrlen; if (args->srcaddr) memcpy(&new->srcaddr, args->srcaddr, args->addrlen); else { int err; err = xs_init_anyaddr(args->dstaddr->sa_family, (struct sockaddr *)&new->srcaddr); if (err != 0) { xprt_free(xprt); return ERR_PTR(err); } } return xprt; } static const struct rpc_timeout xs_local_default_timeout = { .to_initval = 10 * HZ, .to_maxval = 10 * HZ, .to_retries = 2, }; /** * xs_setup_local - Set up transport to use an AF_LOCAL socket * @args: rpc transport creation arguments * * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP */ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) { struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr; struct sock_xprt *transport; struct rpc_xprt *xprt; struct rpc_xprt *ret; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, xprt_max_tcp_slot_table_entries); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = 0; xprt->xprt_class = &xs_local_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { case AF_LOCAL: if (sun->sun_path[0] != '/' && sun->sun_path[0] != '\0') { dprintk("RPC: bad AF_LOCAL address: %s\n", sun->sun_path); ret = ERR_PTR(-EINVAL); goto out_err; } xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } dprintk("RPC: set up xprt to %s via AF_LOCAL\n", xprt->address_strings[RPC_DISPLAY_ADDR]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } static const struct rpc_timeout xs_udp_default_timeout = { .to_initval = 5 * HZ, .to_maxval = 30 * HZ, .to_increment = 5 * HZ, .to_retries = 5, }; /** * xs_setup_udp - Set up transport to use a UDP socket * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; xprt = xs_setup_xprt(args, xprt_udp_slot_table_entries, xprt_udp_slot_table_entries); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_UDP; xprt->xprt_class = &xs_udp_transport; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_UDP_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_udp_ops; xprt->timeout = &xs_udp_default_timeout; INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket); switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } if (xprt_bound(xprt)) dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); else dprintk("RPC: set up xprt to %s (autobind) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } static const struct rpc_timeout xs_tcp_default_timeout = { .to_initval = 60 * HZ, .to_maxval = 60 * HZ, .to_retries = 2, }; /** * xs_setup_tcp - Set up transport to use a TCP socket * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries; if (args->flags & XPRT_CREATE_INFINITE_SLOTS) max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, max_slot_table_size); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; xprt->xprt_class = &xs_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; if (args->reconnect_timeout) xprt->max_reconnect_timeout = args->reconnect_timeout; xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); if (args->connect_timeout) xs_tcp_do_set_connect_timeout(xprt, args->connect_timeout); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } if (xprt_bound(xprt)) dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); else dprintk("RPC: set up xprt to %s (autobind) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } /** * xs_setup_tcp_tls - Set up transport to use a TCP with TLS * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_tcp_tls(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct rpc_xprt *ret; unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries; if (args->flags & XPRT_CREATE_INFINITE_SLOTS) max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, max_slot_table_size); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; xprt->xprt_class = &xs_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; xprt->max_reconnect_timeout = xprt->timeout->to_maxval; xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_WORK(&transport->error_worker, xs_error_handle); switch (args->xprtsec.policy) { case RPC_XPRTSEC_TLS_ANON: case RPC_XPRTSEC_TLS_X509: xprt->xprtsec = args->xprtsec; INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_tls_setup_socket); break; default: ret = ERR_PTR(-EACCES); goto out_err; } switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } if (xprt_bound(xprt)) dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); else dprintk("RPC: set up xprt to %s (autobind) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PROTO]); if (try_module_get(THIS_MODULE)) return xprt; ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } /** * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket * @args: rpc transport creation arguments * */ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) { struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; struct svc_sock *bc_sock; struct rpc_xprt *ret; xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, xprt_tcp_slot_table_entries); if (IS_ERR(xprt)) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; xprt->xprt_class = &xs_bc_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->timeout = &xs_tcp_default_timeout; /* backchannel */ xprt_set_bound(xprt); xprt->bind_timeout = 0; xprt->reestablish_timeout = 0; xprt->idle_timeout = 0; xprt->ops = &bc_tcp_ops; switch (addr->sa_family) { case AF_INET: xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: ret = ERR_PTR(-EAFNOSUPPORT); goto out_err; } dprintk("RPC: set up xprt to %s (port %s) via %s\n", xprt->address_strings[RPC_DISPLAY_ADDR], xprt->address_strings[RPC_DISPLAY_PORT], xprt->address_strings[RPC_DISPLAY_PROTO]); /* * Once we've associated a backchannel xprt with a connection, * we want to keep it around as long as the connection lasts, * in case we need to start using it for a backchannel again; * this reference won't be dropped until bc_xprt is destroyed. */ xprt_get(xprt); args->bc_xprt->xpt_bc_xprt = xprt; xprt->bc_xprt = args->bc_xprt; bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt); transport->sock = bc_sock->sk_sock; transport->inet = bc_sock->sk_sk; /* * Since we don't want connections for the backchannel, we set * the xprt status to connected */ xprt_set_connected(xprt); if (try_module_get(THIS_MODULE)) return xprt; args->bc_xprt->xpt_bc_xprt = NULL; args->bc_xprt->xpt_bc_xps = NULL; xprt_put(xprt); ret = ERR_PTR(-EINVAL); out_err: xs_xprt_free(xprt); return ret; } static struct xprt_class xs_local_transport = { .list = LIST_HEAD_INIT(xs_local_transport.list), .name = "named UNIX socket", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, .netid = { "" }, }; static struct xprt_class xs_udp_transport = { .list = LIST_HEAD_INIT(xs_udp_transport.list), .name = "udp", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { .list = LIST_HEAD_INIT(xs_tcp_transport.list), .name = "tcp", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_tcp_tls_transport = { .list = LIST_HEAD_INIT(xs_tcp_tls_transport.list), .name = "tcp-with-tls", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP_TLS, .setup = xs_setup_tcp_tls, .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { .list = LIST_HEAD_INIT(xs_bc_tcp_transport.list), .name = "tcp NFSv4.1 backchannel", .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, .netid = { "" }, }; /** * init_socket_xprt - set up xprtsock's sysctls, register with RPC client * */ int init_socket_xprt(void) { if (!sunrpc_table_header) sunrpc_table_header = register_sysctl("sunrpc", xs_tunables_table); xprt_register_transport(&xs_local_transport); xprt_register_transport(&xs_udp_transport); xprt_register_transport(&xs_tcp_transport); xprt_register_transport(&xs_tcp_tls_transport); xprt_register_transport(&xs_bc_tcp_transport); return 0; } /** * cleanup_socket_xprt - remove xprtsock's sysctls, unregister * */ void cleanup_socket_xprt(void) { if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; } xprt_unregister_transport(&xs_local_transport); xprt_unregister_transport(&xs_udp_transport); xprt_unregister_transport(&xs_tcp_transport); xprt_unregister_transport(&xs_tcp_tls_transport); xprt_unregister_transport(&xs_bc_tcp_transport); } static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); } static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; #define param_check_portnr(name, p) \ __param_check(name, p, unsigned int); module_param_named(min_resvport, xprt_min_resvport, portnr, 0644); module_param_named(max_resvport, xprt_max_resvport, portnr, 0644); static int param_set_slot_table_size(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_SLOT_TABLE, RPC_MAX_SLOT_TABLE); } static const struct kernel_param_ops param_ops_slot_table_size = { .set = param_set_slot_table_size, .get = param_get_uint, }; #define param_check_slot_table_size(name, p) \ __param_check(name, p, unsigned int); static int param_set_max_slot_table_size(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, RPC_MIN_SLOT_TABLE, RPC_MAX_SLOT_TABLE_LIMIT); } static const struct kernel_param_ops param_ops_max_slot_table_size = { .set = param_set_max_slot_table_size, .get = param_get_uint, }; #define param_check_max_slot_table_size(name, p) \ __param_check(name, p, unsigned int); module_param_named(tcp_slot_table_entries, xprt_tcp_slot_table_entries, slot_table_size, 0644); module_param_named(tcp_max_slot_table_entries, xprt_max_tcp_slot_table_entries, max_slot_table_size, 0644); module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries, slot_table_size, 0644);
11 15 15 8 1 11 8 11 11 15 1 14 8 2 6 2 11 3 11 194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; struct gnet_estimator est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg[] __read_mostly = { { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV4, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV6, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #endif }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
2054 11 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* Copy ready events to userspace */ int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include "rds.h" #include "tcp.h" DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) ____cacheline_aligned; static const char * const rds_tcp_stat_names[] = { "tcp_data_ready_calls", "tcp_write_space_calls", "tcp_sndbuf_full", "tcp_connect_raced", "tcp_listen_closed_stale", }; unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_tcp_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; if (avail < ARRAY_SIZE(rds_tcp_stat_names)) goto out; for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, ARRAY_SIZE(rds_tcp_stat_names)); out: return ARRAY_SIZE(rds_tcp_stat_names); }
194 194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright 2020 NXP */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_gate_ops; static ktime_t gate_get_time(struct tcf_gate *gact) { ktime_t mono = ktime_get(); switch (gact->tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, gact->tk_offset); } return KTIME_MAX; } static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; u64 n; base = ns_to_ktime(param->tcfg_basetime); now = gate_get_time(gact); if (ktime_after(base, now)) { *start = base; return; } cycle = param->tcfg_cycletime; n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) { ktime_t expires; expires = hrtimer_get_expires(&gact->hitimer); if (expires == 0) expires = KTIME_MAX; start = min_t(ktime_t, start, expires); hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); } static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) { struct tcf_gate *gact = container_of(timer, struct tcf_gate, hitimer); struct tcf_gate_params *p = &gact->param; struct tcfg_gate_entry *next; ktime_t close_time, now; spin_lock(&gact->tcf_lock); next = gact->next_entry; /* cycle start, clear pending bit, clear total octets */ gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; gact->current_entry_octets = 0; gact->current_max_octets = next->maxoctets; gact->current_close_time = ktime_add_ns(gact->current_close_time, next->interval); close_time = gact->current_close_time; if (list_is_last(&next->list, &p->entries)) next = list_first_entry(&p->entries, struct tcfg_gate_entry, list); else next = list_next_entry(next, list); now = gate_get_time(gact); if (ktime_after(now, close_time)) { ktime_t cycle, base; u64 n; cycle = p->tcfg_cycletime; base = ns_to_ktime(p->tcfg_basetime); n = div64_u64(ktime_sub_ns(now, base), cycle); close_time = ktime_add_ns(base, (n + 1) * cycle); } gact->next_entry = next; hrtimer_set_expires(&gact->hitimer, close_time); spin_unlock(&gact->tcf_lock); return HRTIMER_RESTART; } TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); tcf_action_update_bstats(&gact->common, skb); spin_lock(&gact->tcf_lock); if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); return action; } if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { spin_unlock(&gact->tcf_lock); goto drop; } if (gact->current_max_octets >= 0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { spin_unlock(&gact->tcf_lock); goto overlimit; } } spin_unlock(&gact->tcf_lock); return action; overlimit: tcf_action_inc_overlimit_qstats(&gact->common); drop: tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, }; static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { [TCA_GATE_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_gate)), [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, [TCA_GATE_FLAGS] = { .type = NLA_U32 }, [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, }; static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, struct netlink_ext_ack *extack) { u32 interval = 0; entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); if (tb[TCA_GATE_ENTRY_INTERVAL]) interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); if (interval == 0) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], -1); return 0; } static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_gate_entry(tb, entry, extack); } static void release_entry_list(struct list_head *entries) { struct tcfg_gate_entry *entry, *e; list_for_each_entry_safe(entry, e, entries, list) { list_del(&entry->list); kfree(entry); } } static int parse_gate_list(struct nlattr *list_attr, struct tcf_gate_params *sched, struct netlink_ext_ack *extack) { struct tcfg_gate_entry *entry; struct nlattr *n; int err, rem; int i = 0; if (!list_attr) return -EINVAL; nla_for_each_nested(n, list_attr, rem) { if (nla_type(n) != TCA_GATE_ONE_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); err = -ENOMEM; goto release_list; } err = parse_gate_entry(n, entry, i, extack); if (err < 0) { kfree(entry); goto release_list; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; release_list: release_entry_list(&sched->entries); return err; } static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, enum tk_offsets tko, s32 clockid, bool do_init) { if (!do_init) { if (basetime == gact->param.tcfg_basetime && tko == gact->tk_offset && clockid == gact->param.tcfg_clockid) return; spin_unlock_bh(&gact->tcf_lock); hrtimer_cancel(&gact->hitimer); spin_lock_bh(&gact->tcf_lock); } gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; u32 gflags = 0; s32 prio = -1; ktime_t start; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); if (err < 0) return err; if (!tb[TCA_GATE_PARMS]) return -EINVAL; if (tb[TCA_GATE_CLOCKID]) { clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); return -EINVAL; } } parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; if (err && bind) return ACT_P_BOUND; if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); if (tb[TCA_GATE_BASE_TIME]) basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); gact = to_gate(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&gact->tcf_lock); p = &gact->param; if (tb[TCA_GATE_CYCLE_TIME]) cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); if (err < 0) goto chain_put; } if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); cycletime = cycle; if (!cycletime) { err = -EINVAL; goto chain_put; } } p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); gate_setup_timer(gact, basetime, tk_offset, clockid, ret == ACT_P_CREATED); p->tcfg_priority = prio; p->tcfg_flags = gflags; gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; gact->next_entry = list_first_entry(&p->entries, struct tcfg_gate_entry, list); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); gate_start_timer(gact, start); spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; chain_put: spin_unlock_bh(&gact->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: /* action is not inserted in any list: it's safe to init hitimer * without taking tcf_lock. */ if (ret == ACT_P_CREATED) gate_setup_timer(gact, gact->param.tcfg_basetime, gact->tk_offset, gact->param.tcfg_clockid, true); tcf_idr_release(*a, bind); return err; } static void tcf_gate_cleanup(struct tc_action *a) { struct tcf_gate *gact = to_gate(a); struct tcf_gate_params *p; p = &gact->param; hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } static int dumping_entry(struct sk_buff *skb, struct tcfg_gate_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) goto nla_put_failure; return nla_nest_end(skb, item); nla_put_failure: nla_nest_cancel(skb, item); return -1; } static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_gate *gact = to_gate(a); struct tc_gate opt = { .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, }; struct tcfg_gate_entry *entry; struct tcf_gate_params *p; struct nlattr *entry_list; struct tcf_t t; spin_lock_bh(&gact->tcf_lock); opt.action = gact->tcf_action; p = &gact->param; if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, p->tcfg_basetime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, p->tcfg_cycletime, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, p->tcfg_cycletime_ext, TCA_GATE_PAD)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) goto nla_put_failure; if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) goto nla_put_failure; if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) goto nla_put_failure; entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); if (!entry_list) goto nla_put_failure; list_for_each_entry(entry, &p->entries, list) { if (dumping_entry(skb, entry) < 0) goto nla_put_failure; } nla_nest_end(skb, entry_list); tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) goto nla_put_failure; spin_unlock_bh(&gact->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_gate *gact = to_gate(a); struct tcf_t *tm = &gact->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_gate_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_gate)); } static void tcf_gate_entry_destructor(void *priv) { struct action_gate_entry *oe = priv; kfree(oe); } static int tcf_gate_get_entries(struct flow_action_entry *entry, const struct tc_action *act) { entry->gate.entries = tcf_gate_get_list(act); if (!entry->gate.entries) return -EINVAL; entry->destructor = tcf_gate_entry_destructor; entry->destructor_priv = entry->gate.entries; return 0; } static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_GATE; entry->gate.prio = tcf_gate_prio(act); entry->gate.basetime = tcf_gate_basetime(act); entry->gate.cycletime = tcf_gate_cycletime(act); entry->gate.cycletimeext = tcf_gate_cycletimeext(act); entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) return err; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_GATE; } return 0; } static struct tc_action_ops act_gate_ops = { .kind = "gate", .id = TCA_ID_GATE, .owner = THIS_MODULE, .act = tcf_gate_act, .dump = tcf_gate_dump, .init = tcf_gate_init, .cleanup = tcf_gate_cleanup, .stats_update = tcf_gate_stats_update, .get_fill_size = tcf_gate_get_fill_size, .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); return tc_action_net_init(net, tn, &act_gate_ops); } static void __net_exit gate_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_gate_ops.net_id); } static struct pernet_operations gate_net_ops = { .init = gate_init_net, .exit_batch = gate_exit_net, .id = &act_gate_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init gate_init_module(void) { return tcf_register_action(&act_gate_ops, &gate_net_ops); } static void __exit gate_cleanup_module(void) { tcf_unregister_action(&act_gate_ops, &gate_net_ops); } module_init(gate_init_module); module_exit(gate_cleanup_module); MODULE_DESCRIPTION("TC gate action"); MODULE_LICENSE("GPL v2");
21 10 12 10 2 2 12 12 12 12 2 12 12 12 12 12 12 12 12 10 10 12 12 1 11 10 11 10 10 10 12 1 11 4 11 11 4 4 4 12 12 12 12 12 12 12 12 12 12 12 22 1531 453 1521 353 1522 1529 1530 238 355 355 238 2781 1162 1995 432 29 183 453 6 1531 1109 376 394 523 804 804 261 995 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
25064 25067 10662 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> #include <linux/static_call.h> #include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> #include <asm/msr.h> #include <asm/topology.h> #include <asm/uv/uv.h> #include <asm/sev.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); #define KHZ 1000 /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; static struct clocksource_base art_base_clk = { .id = CSID_X86_ART, }; static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); static int __init tsc_early_khz_setup(char *buf) { return kstrtouint(buf, 0, &tsc_early_khz); } early_param("tsc_early_khz", tsc_early_khz_setup); __always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { preempt_disable_notrace(); __cyc2ns_read(data); } __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } /* * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) * ns = cycles * (ns_per_sec / freq) * ns = cycles * (10^9 / (cpu_khz * 10^3)) * ns = cycles * (10^6 / cpu_khz) * * Then we use scaling math (suggested by george@mvista.com) to get: * ns = cycles * (10^6 * SC / cpu_khz) / SC * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div * into a shift. The larger SC is, the more accurate the conversion, but * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication * (64-bit result) can be used. * * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); return ns; } static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; preempt_disable_notrace(); ns = __cycles_2_ns(cyc); preempt_enable_notrace(); return ns; } static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; ns_now = cycles_2_ns(tsc_now); /* * Compute a new multiplier as per the above comment and ensure our * time function is continuous; see the comment near struct * cyc2ns_data. */ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* * cyc2ns_shift is exported via arch_perf_update_userpage() where it is * not expected to be greater than 31 due to the original published * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ if (data.cyc2ns_shift == 32) { data.cyc2ns_shift = 31; data.cyc2ns_mul >>= 1; } data.cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); c2n = per_cpu_ptr(&cyc2ns, cpu); write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; write_seqcount_latch(&c2n->seq); c2n->data[1] = data; write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); if (khz) __set_cyc2ns_scale(khz, cpu, tsc_now); sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* * Initialize cyc2ns for boot cpu */ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { unsigned int cpu, this_cpu = smp_processor_id(); struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); struct cyc2ns_data *data = c2n->data; for_each_possible_cpu(cpu) { if (cpu != this_cpu) { seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; } } } /* * Scheduler clock - returns current time in nanosec units. */ noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ return __cycles_2_ns(tsc_now); } /* * Fall back to jiffies if there's no TSC available: * ( But note that we still use it if the TSC is marked * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform * can achieve it. ) */ /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* * Generate a sched_clock if you already have a TSC value. */ u64 native_sched_clock_from_tsc(u64 tsc) { return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } bool using_native_sched_clock(void) { return static_call_query(pv_sched_clock) == native_sched_clock; } #else u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif notrace u64 sched_clock(void) { u64 now; preempt_disable_notrace(); now = sched_clock_noinstr(); preempt_enable_notrace(); return now; } int check_tsc_unstable(void) { return tsc_unstable; } EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { mark_tsc_unstable("boot parameter notsc"); return 1; } #else /* * disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ int __init notsc_setup(char *str) { setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; static int tsc_as_watchdog; static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; if (tsc_as_watchdog) pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", __func__); tsc_as_watchdog = 0; } if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; if (!strcmp(str, "watchdog")) { if (no_tsc_watchdog) pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", __func__); else tsc_as_watchdog = 1; } return 1; } __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 #define TSC_DEFAULT_THRESHOLD 0x20000 /* * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; } /* * Calculate the TSC frequency from HPET reference */ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { u64 tmp; if (hpet2 < hpet1) hpet2 += 0x100000000ULL; hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } /* * Calculate the TSC frequency from PMTimer reference */ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) { u64 tmp; if (!pm1 && !pm2) return ULONG_MAX; if (pm2 < pm1) pm2 += (u64)ACPI_PM_OVRRUN; pm2 -= pm1; tmp = pm2 * 1000000000LL; do_div(tmp, PMTMR_TICKS_PER_SEC); do_div(deltatsc, tmp); return (unsigned long) deltatsc; } #define CAL_MS 10 #define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 #define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC * in kHz. * * Return ULONG_MAX on failure to calibrate. */ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; int pitcnt; if (!has_legacy_pic()) { /* * Relies on tsc_early_delay_calibrate() to have given us semi * usable udelay(), wait for the same 50ms we would have with * the PIT loop below. */ udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); return ULONG_MAX; } /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Setup CTC channel 2* for mode 0, (interrupt on terminal * count mode), binary count. Set the latch register to 50ms * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); outb(latch & 0xff, 0x42); outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); pitcnt = 0; tscmax = 0; tscmin = ULONG_MAX; while ((inb(0x61) & 0x20) == 0) { t2 = get_cycles(); delta = t2 - tsc; tsc = t2; if ((unsigned long) delta < tscmin) tscmin = (unsigned int) delta; if ((unsigned long) delta > tscmax) tscmax = (unsigned int) delta; pitcnt++; } /* * Sanity checks: * * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; do_div(delta, ms); return delta; } /* * This reads the current MSB of the PIT counter, and * checks if we are running on sufficiently fast and * non-virtualized hardware. * * Our expectations are: * * - the PIT is running at roughly 1.19MHz * * - each IO is going to take about 1us on real hardware, * but we allow it to be much faster (by a factor of 10) or * _slightly_ slower (ie we allow up to a 2us read+counter * update - anything else implies a unacceptably slow CPU * or PIT for the fast calibration to work. * * - with 256 PIT ticks to read the value, we have 214us to * see the same MSB (and overhead like doing a single TSC * read per MSB value etc). * * - We're doing 2 reads per loop (LSB, MSB), and we expect * them each to take about a microsecond on real hardware. * So we expect a count value of around 100. But we'll be * generous, and accept anything over 50. * * - if the PIT is stuck, and we see *many* more reads, we * return early (and the next caller of pit_expect_msb() * then consider it a failure when they don't see the * next expected value). * * These expectations mean that we know that we have seen the * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { /* Ignore LSB */ inb(0x42); return inb(0x42) == val; } static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; prev_tsc = tsc; tsc = get_cycles(); } *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* * We require _some_ success, but the quality control * will be based on the error terms on the TSC values. */ return count > 5; } /* * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend * more than 50ms on it. */ #define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { int i; u64 tsc, delta; unsigned long d1, d2; if (!has_legacy_pic()) return 0; /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Counter 2, mode 0 (one-shot), binary count * * NOTE! Mode 2 decrements by two (and then the * output is flipped each time, giving the same * final output frequency as a decrement-by-one), * so mode 0 is much better when looking at the * individual counts. */ outb(0xb0, 0x43); /* Start at 0xffff */ outb(0xff, 0x42); outb(0xff, 0x42); /* * The PIT starts counting at the next edge, so we * need to delay for a microsecond. The easiest way * to do that is to just read back the 16-bit counter * once from the PIT. */ pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { if (!pit_expect_msb(0xff-i, &delta, &d2)) break; delta -= tsc; /* * Extrapolate the error and fail fast if the error will * never be below 500 ppm. */ if (i == 1 && d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) return 0; /* * Iterate until the error is less than 500 ppm */ if (d1+d2 >= delta >> 11) continue; /* * Check the PIT one more time to verify that * all TSC reads were stable wrt the PIT. * * This also guarantees serialization of the * last cycle read ('d2') in pit_expect_msb. */ if (!pit_verify_msb(0xfe - i)) break; goto success; } } pr_info("Fast TSC calibration failed\n"); return 0; success: /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement 'i' times, and the * error has shrunk to less than 500 ppm. * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); pr_info("Fast TSC calibration using PIT\n"); return delta; } /** * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; unsigned int crystal_khz; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; /* * Denverton SoCs don't report crystal clock, and also don't support * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz * crystal clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ if (crystal_khz != 0) setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* * Some Intel SoCs like Skylake and Kabylake don't report the crystal * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { unsigned int eax_base_mhz, ebx, ecx, edx; cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } if (crystal_khz == 0) return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC /* * The local APIC appears to be fed by the core crystal clock * (which sounds entirely sensible). We can set the global * lapic_timer_period here to avoid having to calibrate the APIC * timer later. */ lapic_timer_period = crystal_khz * 1000 / HZ; #endif return crystal_khz * ebx_numerator / eax_denominator; } static unsigned long cpu_khz_from_cpuid(void) { unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } /* * calibrate cpu using pit, hpet, and ptimer methods. They are available * later in boot after acpi is initialized. */ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes * here: * * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and * load a timeout of 50ms. We read the time right after we * started the timer and wait until the PIT count down reaches * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. */ /* Preset PIT loop values */ latch = CAL_LATCH; ms = CAL_MS; loopmin = CAL_PIT_LOOPS; for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* * Read the start value and the reference count of * hpet/pmtimer when available. Then do the PIT * calibration, which will take at least 50ms, and * read the end value. */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); /* Check the reference deviation */ delta = ((u64) tsc_pit_min) * 100; do_div(delta, tsc_ref_min); /* * If both calibration results are inside a 10% window * then we can be sure, that the calibration * succeeded. We break out of the loop right away. We * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { pr_info("PIT calibration matches %s. %d loops\n", hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } /* * Check whether PIT failed more than once. This * happens in virtualized environments. We need to * give the virtual PC a slightly longer timeframe for * the HPET/PMTIMER to make the result precise. */ if (i == 1 && tsc_pit_min == ULONG_MAX) { latch = CAL2_LATCH; ms = CAL2_MS; loopmin = CAL2_PIT_LOOPS; } } /* * Now check the results. */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ pr_info("using %s reference calibration\n", hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ pr_warn("PIT calibration deviates from %s: %lu %lu\n", hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /** * native_calibrate_cpu_early - can calibrate the cpu early in boot */ unsigned long native_calibrate_cpu_early(void) { unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); if (!fast_calibrate) fast_calibrate = cpu_khz_from_msr(); if (!fast_calibrate) { local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); } return fast_calibrate; } /** * native_calibrate_cpu - calibrate the cpu */ static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); if (!tsc_freq) tsc_freq = pit_hpet_ptimer_calibrate_cpu(); return tsc_freq; } void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); #endif } EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); } /* * Even on processors with invariant TSC, TSC gets reset in some the * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to * arbitrary value (still sync'd across cpu's) during resume from such sleep * states. To cope up with this, recompute the cyc2ns_offset for each cpu so * that sched_clock() continues from the point where it was left off during * suspend. */ void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; int cpu; if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); /* * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) { per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; } local_irq_restore(flags); } #ifdef CONFIG_CPU_FREQ /* * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. */ static unsigned int ref_freq; static unsigned long loops_per_jiffy_ref; static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; if (num_online_cpus() > 1) { mark_tsc_unstable("cpufreq changes on SMP"); return 0; } if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { boot_cpu_data.loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; } static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; } core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ #define ART_MIN_DENOMINATOR (1) /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { unsigned int unused; if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; /* * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, * and the TSC counter resets must not occur asynchronously. */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_async_resets) return; cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); art_base_clk.freq_khz /= KHZ; if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } /* clocksource code */ static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); } /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. * * This sanity check is now done in the core timekeeping code. * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ static u64 read_tsc(struct clocksource *cs) { return (u64)rdtsc_ordered(); } static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } static void tsc_cs_tick_stable(struct clocksource *cs) { if (tsc_unstable) return; if (using_native_sched_clock()) sched_clock_tick_stable(); } static int tsc_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_TSC); return 0; } /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* * Must mark VALID_FOR_HRES early such that when we unregister tsc_early * this one will immediately take over. We will only register if TSC has * been found good. */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } bool tsc_clocksource_watchdog_disabled(void) { return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && tsc_as_watchdog && !no_tsc_watchdog; } static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) if (is_geode_lx()) { /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; /* * Disable the clocksource watchdog when the system has: * - TSC running at constant frequency * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ int unsynchronized_tsc(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; if (tsc_clocksource_reliable) return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (topology_max_packages() > 1) return 1; } return 0; } static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. */ static void tsc_refine_calibration_work(struct work_struct *work) { static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. */ if (tsc_start == ULLONG_MAX) { restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); tsc_start = tsc_read_refs(&ref_start, hpet); schedule_delayed_work(&tsc_irqwork, HZ); return; } tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed */ if (tsc_stop == ULLONG_MAX) goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; if (hpet) freq = calc_hpet_ref(delta, ref_start, ref_stop); else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); /* Will hit this only if tsc_force_recalibrate has been set */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { /* Warn if the deviation exceeds 500 ppm */ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); } pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", hpet ? "HPET" : "PM_TIMER", (unsigned long)freq / 1000, (unsigned long)freq % 1000); return; } /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; tsc_khz = freq; pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); /* Update the sched_clock() rate to match the clocksource one */ for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: if (tsc_unstable) goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) { clocksource_unregister(&clocksource_tsc_early); return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); if (!tsc_force_recalibrate) return 0; } schedule_delayed_work(&tsc_irqwork, 0); return 0; } /* * We use device_initcall here, to ensure we run after the hpet * is fully initialized, which may occur at fs_initcall time. */ device_initcall(init_tsc_clocksource); static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || tsc_khz); if (early) { cpu_khz = x86_platform.calibrate_cpu(); if (tsc_early_khz) { tsc_khz = tsc_early_khz; } else { tsc_khz = x86_platform.calibrate_tsc(); clocksource_tsc.freq_khz = tsc_khz; } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } /* * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; if (tsc_khz == 0) return false; pr_info("Detected %lu.%03lu MHz processor\n", (unsigned long)cpu_khz / KHZ, (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", (unsigned long)tsc_khz / KHZ, (unsigned long)tsc_khz % KHZ); } return true; } static unsigned long __init get_loops_per_jiffy(void) { u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; } static void __init tsc_enable_sched_clock(void) { loops_per_jiffy = get_loops_per_jiffy(); use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); static_branch_enable(&__use_tsc); } void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; snp_secure_tsc_init(); if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); } void __init tsc_init(void) { if (!cpu_feature_enabled(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. */ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); return; } if (tsc_clocksource_reliable || no_tsc_watchdog) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); /* * If TSC has constant frequency and TSC is synchronized across * sockets then reuse CPU0 calibration. */ if (constant_tsc && !tsc_unstable) return cpu_data(0).loops_per_jiffy; /* * If TSC has constant frequency and TSC is not synchronized across * sockets and this is not the first CPU in the socket, then reuse * the calibration value of an already online CPU on that socket. * * This assumes that CONSTANT_TSC is consistent for all CPUs in a * socket. */ if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; } #endif
55 663 687 613 669 38 49 26 26 11 38 3 12 4 5 26 16 23 3 5 22 576 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2023 Isovalent */ #ifndef __NET_TCX_H #define __NET_TCX_H #include <linux/bpf.h> #include <linux/bpf_mprog.h> #include <net/sch_generic.h> struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; u32 miniq_active; struct rcu_head rcu; }; struct tcx_link { struct bpf_link link; struct net_device *dev; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) { #ifdef CONFIG_NET_XGRESS skb->tc_at_ingress = ingress; #endif } #ifdef CONFIG_NET_XGRESS static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) { struct bpf_mprog_bundle *bundle = entry->parent; return container_of(bundle, struct tcx_entry, bundle); } static inline struct tcx_link *tcx_link(const struct bpf_link *link) { return container_of(link, struct tcx_link, link); } void tcx_inc(void); void tcx_dec(void); static inline void tcx_entry_sync(void) { /* bpf_mprog_entry got a/b swapped, therefore ensure that * there are no inflight users on the old one anymore. */ synchronize_rcu(); } static inline void tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, bool ingress) { ASSERT_RTNL(); if (ingress) rcu_assign_pointer(dev->tcx_ingress, entry); else rcu_assign_pointer(dev->tcx_egress, entry); } static inline struct bpf_mprog_entry * tcx_entry_fetch(struct net_device *dev, bool ingress) { ASSERT_RTNL(); if (ingress) return rcu_dereference_rtnl(dev->tcx_ingress); else return rcu_dereference_rtnl(dev->tcx_egress); } static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) { struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); return &tcx->bundle.a; } return NULL; } #define tcx_entry_create(...) alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { kfree_rcu(tcx_entry(entry), rcu); } static inline struct bpf_mprog_entry * tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) { struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); *created = false; if (!entry) { entry = tcx_entry_create(); if (!entry) return NULL; *created = true; } return entry; } static inline void tcx_skeys_inc(bool ingress) { tcx_inc(); if (ingress) net_inc_ingress_queue(); else net_inc_egress_queue(); } static inline void tcx_skeys_dec(bool ingress) { if (ingress) net_dec_ingress_queue(); else net_dec_egress_queue(); tcx_dec(); } static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active++; } static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active--; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; } static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, int code) { switch (code) { case TCX_PASS: skb->tc_index = qdisc_skb_cb(skb)->tc_classid; fallthrough; case TCX_DROP: case TCX_REDIRECT: return code; case TCX_NEXT: default: return TCX_NEXT; } } #endif /* CONFIG_NET_XGRESS */ #if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); void tcx_uninstall(struct net_device *dev, bool ingress); int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); static inline void dev_tcx_uninstall(struct net_device *dev) { ASSERT_RTNL(); tcx_uninstall(dev, true); tcx_uninstall(dev, false); } #else static inline int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline void dev_tcx_uninstall(struct net_device *dev) { } #endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ #endif /* __NET_TCX_H */
1995 1993 1993 1994 9 154 154 154 1942 1936 95 96 96 708 39 39 145 145 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/file.c - sysfs regular (text) file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/module.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/mm.h> #include "sysfs.h" static struct kobject *sysfs_file_kobj(struct kernfs_node *kn) { guard(rcu)(); return rcu_dereference(kn->__parent)->priv; } /* * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { struct kobject *kobj = sysfs_file_kobj(kn); if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } /* * Reads on sysfs are handled through seq_file, which takes care of hairy * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; struct kobject *kobj = sysfs_file_kobj(of->kn); const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; if (WARN_ON_ONCE(!ops->show)) return -EINVAL; /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } memset(buf, 0, PAGE_SIZE); count = ops->show(kobj, of->kn->priv, buf); if (count < 0) return count; /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. */ if (count >= (ssize_t)PAGE_SIZE) { printk("fill_read_buffer: %pS returned bad count\n", ops->show); /* Try to struggle along */ count = PAGE_SIZE - 1; } seq_commit(sf, count); return 0; } static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (!count) return 0; if (size) { if (pos >= size) return 0; if (pos + count > size) count = size - pos; } if (!battr->read) return -EIO; return battr->read(of->file, kobj, battr, buf, pos, count); } /* kernfs read callback for regular sysfs files with pre-alloc */ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = sysfs_file_kobj(of->kn); ssize_t len; /* * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); if (len < 0) return len; if (pos) { if (len <= pos) return 0; len -= pos; memmove(buf, buf + pos, len); } return min_t(ssize_t, count, len); } /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = sysfs_file_kobj(of->kn); if (!count) return 0; return ops->store(kobj, of->kn->priv, buf, count); } /* kernfs write callback for bin sysfs files */ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (size) { if (size <= pos) return -EFBIG; count = min_t(ssize_t, count, size - pos); } if (!count) return 0; if (!battr->write) return -EIO; return battr->write(of->file, kobj, battr, buf, pos, count); } static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); return battr->mmap(of->file, kobj, battr, vma); } static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, int whence) { const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); if (battr->llseek) return battr->llseek(of->file, kobj, battr, offset, whence); else return generic_file_llseek(of->file, offset, whence); } static int sysfs_kf_bin_open(struct kernfs_open_file *of) { const struct bin_attribute *battr = of->kn->priv; if (battr->f_mapping) of->file->f_mapping = battr->f_mapping(); return 0; } void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { struct kernfs_node *kn = kobj->sd, *tmp; if (kn && dir) kn = kernfs_find_and_get(kn, dir); else kernfs_get(kn); if (kn && attr) { tmp = kernfs_find_and_get(kn, attr); kernfs_put(kn); kn = tmp; } if (kn) { kernfs_notify(kn); kernfs_put(kn); } } EXPORT_SYMBOL_GPL(sysfs_notify); static const struct kernfs_ops sysfs_file_kfops_empty = { }; static const struct kernfs_ops sysfs_file_kfops_ro = { .seq_show = sysfs_kf_seq_show, }; static const struct kernfs_ops sysfs_file_kfops_wo = { .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_file_kfops_rw = { .seq_show = sysfs_kf_seq_show, .write = sysfs_kf_write, }; static const struct kernfs_ops sysfs_prealloc_kfops_ro = { .read = sysfs_kf_read, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_wo = { .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_prealloc_kfops_rw = { .read = sysfs_kf_read, .write = sysfs_kf_write, .prealloc = true, }; static const struct kernfs_ops sysfs_bin_kfops_ro = { .read = sysfs_kf_bin_read, }; static const struct kernfs_ops sysfs_bin_kfops_wo = { .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_rw = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, }; static const struct kernfs_ops sysfs_bin_kfops_mmap = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, .open = sysfs_kf_bin_open, .llseek = sysfs_kf_bin_llseek, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, umode_t mode, kuid_t uid, kgid_t gid, const void *ns) { struct kobject *kobj = parent->priv; const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; struct lock_class_key *key = NULL; const struct kernfs_ops *ops = NULL; struct kernfs_node *kn; /* every kobject with an attribute needs a ktype assigned */ if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) return -EINVAL; if (mode & SYSFS_PREALLOC) { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_prealloc_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_prealloc_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_prealloc_kfops_wo; } else { if (sysfs_ops->show && sysfs_ops->store) ops = &sysfs_file_kfops_rw; else if (sysfs_ops->show) ops = &sysfs_file_kfops_ro; else if (sysfs_ops->store) ops = &sysfs_file_kfops_wo; } if (!ops) ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, PAGE_SIZE, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct bin_attribute *battr, umode_t mode, size_t size, kuid_t uid, kgid_t gid, const void *ns) { const struct attribute *attr = &battr->attr; struct lock_class_key *key = NULL; const struct kernfs_ops *ops; struct kernfs_node *kn; if (battr->mmap) ops = &sysfs_bin_kfops_mmap; else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; else if (battr->read) ops = &sysfs_bin_kfops_ro; else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); return PTR_ERR(kn); } return 0; } /** * sysfs_create_file_ns - create an attribute file for an object with custom ns * @kobj: object we're creating for * @attr: attribute descriptor * @ns: namespace the new file should belong to */ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr) { int err = 0; int i; for (i = 0; ptr[i] && !err; i++) err = sysfs_create_file(kobj, ptr[i]); if (err) while (--i >= 0) sysfs_remove_file(kobj, ptr[i]); return err; } EXPORT_SYMBOL_GPL(sysfs_create_files); /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid, NULL); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); /** * sysfs_chmod_file - update the modified mode value on an object attribute. * @kobj: object we're acting for. * @attr: attribute descriptor. * @mode: file permissions. * */ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { struct kernfs_node *kn; struct iattr newattrs; int rc; kn = kernfs_find_and_get(kobj->sd, attr->name); if (!kn) return -ENOENT; newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; rc = kernfs_setattr(kn, &newattrs); kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); /** * sysfs_break_active_protection - break "active" protection * @kobj: The kernel object @attr is associated with. * @attr: The attribute to break the "active" protection for. * * With sysfs, just like kernfs, deletion of an attribute is postponed until * all active .show() and .store() callbacks have finished unless this function * is called. Hence this function is useful in methods that implement self * deletion. */ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *kn; kobject_get(kobj); kn = kernfs_find_and_get(kobj->sd, attr->name); if (kn) kernfs_break_active_protection(kn); else kobject_put(kobj); return kn; } EXPORT_SYMBOL_GPL(sysfs_break_active_protection); /** * sysfs_unbreak_active_protection - restore "active" protection * @kn: Pointer returned by sysfs_break_active_protection(). * * Undo the effects of sysfs_break_active_protection(). Since this function * calls kernfs_put() on the kernfs node that corresponds to the 'attr' * argument passed to sysfs_break_active_protection() that attribute may have * been removed between the sysfs_break_active_protection() and * sysfs_unbreak_active_protection() calls, it is not safe to access @kn after * this function has returned. */ void sysfs_unbreak_active_protection(struct kernfs_node *kn) { struct kobject *kobj = sysfs_file_kobj(kn); kernfs_unbreak_active_protection(kn); kernfs_put(kn); kobject_put(kobj); } EXPORT_SYMBOL_GPL(sysfs_unbreak_active_protection); /** * sysfs_remove_file_ns - remove an object attribute with a custom ns tag * @kobj: object we're acting for * @attr: attribute descriptor * @ns: namespace tag of the file to remove * * Hash the attribute name and namespace tag and kill the victim. */ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { struct kernfs_node *parent = kobj->sd; kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); /** * sysfs_remove_file_self - remove an object attribute from its own method * @kobj: object we're acting for * @attr: attribute descriptor * * See kernfs_remove_self() for details. */ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; bool ret; kn = kernfs_find_and_get(parent, attr->name); if (WARN_ON_ONCE(!kn)) return false; ret = kernfs_remove_self(kn); kernfs_put(kn); return ret; } EXPORT_SYMBOL_GPL(sysfs_remove_file_self); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_files); /** * sysfs_remove_file_from_group - remove an attribute file from a group. * @kobj: object we're acting for. * @attr: attribute descriptor. * @group: group name. */ void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { struct kernfs_node *parent; if (group) { parent = kernfs_find_and_get(kobj->sd, group); } else { parent = kobj->sd; kernfs_get(parent); } if (parent) { kernfs_remove_by_name(parent, attr->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); /** * sysfs_create_bin_file - create binary file for object. * @kobj: object. * @attr: attribute descriptor. */ int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kuid_t uid; kgid_t gid; if (WARN_ON(!kobj || !kobj->sd || !attr)) return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, attr->size, uid, gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); /** * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. */ void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, kgid_t kgid) { struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; return kernfs_setattr(kn, &newattrs); } /** * sysfs_link_change_owner - change owner of a sysfs file. * @kobj: object of the kernfs_node the symlink is located in. * @targ: object of the kernfs_node the symlink points to. * @name: name of the link. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs symlink entry @name under @kobj and changes * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of * @targ. * * Returns 0 on success or error code on failure. */ int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn = NULL; int error; if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) return -EINVAL; error = -ENOENT; kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); if (!kn) goto out; error = -EINVAL; if (kernfs_type(kn) != KERNFS_LINK) goto out; if (kn->symlink.target_kn->priv != targ) goto out; error = internal_change_owner(kn, kuid, kgid); out: kernfs_put(kn); return error; } /** * sysfs_file_change_owner - change owner of a sysfs file. * @kobj: object. * @name: name of the file to change. * @kuid: new owner's kuid * @kgid: new owner's kgid * * This function looks up the sysfs entry @name under @kobj and changes the * ownership to @kuid/@kgid. * * Returns 0 on success or error code on failure. */ int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { struct kernfs_node *kn; int error; if (!name) return -EINVAL; if (!kobj->state_in_sysfs) return -EINVAL; kn = kernfs_find_and_get(kobj->sd, name); if (!kn) return -ENOENT; error = internal_change_owner(kn, kuid, kgid); kernfs_put(kn); return error; } EXPORT_SYMBOL_GPL(sysfs_file_change_owner); /** * sysfs_change_owner - change owner of the given object. * @kobj: object. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Change the owner of the default directory, files, groups, and attributes of * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs * entries for a kobject are added by driver core. In summary, * sysfs_change_owner() takes care of the default directory entry for @kobj, * the default attributes associated with the ktype of @kobj and the default * attributes associated with the ktype of @kobj. * Additional properties not added by driver core have to be changed by the * driver or subsystem which created them. This is similar to how * driver/subsystem specific entries are removed. * * Returns 0 on success or error code on failure. */ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { int error; const struct kobj_type *ktype; if (!kobj->state_in_sysfs) return -EINVAL; /* Change the owner of the kobject itself. */ error = internal_change_owner(kobj->sd, kuid, kgid); if (error) return error; ktype = get_ktype(kobj); if (ktype) { /* * Change owner of the default groups associated with the * ktype of @kobj. */ error = sysfs_groups_change_owner(kobj, ktype->default_groups, kuid, kgid); if (error) return error; } return 0; } EXPORT_SYMBOL_GPL(sysfs_change_owner); /** * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. * @fmt: format * @...: optional arguments to @format * * * Returns number of characters written to @buf. */ int sysfs_emit(char *buf, const char *fmt, ...) { va_list args; int len; if (WARN(!buf || offset_in_page(buf), "invalid sysfs_emit: buf:%p\n", buf)) return 0; va_start(args, fmt); len = vscnprintf(buf, PAGE_SIZE, fmt, args); va_end(args); return len; } EXPORT_SYMBOL_GPL(sysfs_emit); /** * sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer. * @buf: start of PAGE_SIZE buffer. * @at: offset in @buf to start write in bytes * @at must be >= 0 && < PAGE_SIZE * @fmt: format * @...: optional arguments to @fmt * * * Returns number of characters written starting at &@buf[@at]. */ int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { va_list args; int len; if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE, "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at)) return 0; va_start(args, fmt); len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args); va_end(args); return len; } EXPORT_SYMBOL_GPL(sysfs_emit_at); /** * sysfs_bin_attr_simple_read - read callback to simply copy from memory. * @file: attribute file which is being read. * @kobj: object to which the attribute belongs. * @attr: attribute descriptor. * @buf: destination buffer. * @off: offset in bytes from which to read. * @count: maximum number of bytes to read. * * Simple ->read() callback for bin_attributes backed by a buffer in memory. * The @private and @size members in struct bin_attribute must be set to the * buffer's location and size before the bin_attribute is created in sysfs. * * Bounds check for @off and @count is done in sysfs_kf_bin_read(). * Negative value check for @off is done in vfs_setpos() and default_llseek(). * * Returns number of bytes written to @buf. */ ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { memcpy(buf, attr->private + off, count); return count; } EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);
10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 // SPDX-License-Identifier: GPL-2.0-or-later /* * PTP virtual clock driver * * Copyright 2021 NXP */ #include <linux/slab.h> #include <linux/hashtable.h> #include "ptp_private.h" #define PTP_VCLOCK_CC_SHIFT 31 #define PTP_VCLOCK_CC_MULT (1 << PTP_VCLOCK_CC_SHIFT) #define PTP_VCLOCK_FADJ_SHIFT 9 #define PTP_VCLOCK_FADJ_DENOMINATOR 15625ULL #define PTP_VCLOCK_REFRESH_INTERVAL (HZ * 2) /* protects vclock_hash addition/deletion */ static DEFINE_SPINLOCK(vclock_hash_lock); static DEFINE_READ_MOSTLY_HASHTABLE(vclock_hash, 8); static void ptp_vclock_hash_add(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_add_head_rcu(&vclock->vclock_hash_node, &vclock_hash[vclock->clock->index % HASH_SIZE(vclock_hash)]); spin_unlock(&vclock_hash_lock); } static void ptp_vclock_hash_del(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_del_init_rcu(&vclock->vclock_hash_node); spin_unlock(&vclock_hash_lock); synchronize_rcu(); } static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct ptp_vclock *vclock = info_to_vclock(ptp); s64 adj; adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT; adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_read(&vclock->tc); vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj; mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct ptp_vclock *vclock = info_to_vclock(ptp); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_adjtime(&vclock->tc, delta); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_read(&vclock->tc); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; struct timespec64 pts; int err; u64 ns; err = pptp->info->getcyclesx64(pptp->info, &pts, sts); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts)); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns = timespec64_to_ns(ts); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_init(&vclock->tc, &vclock->cc, ns); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; int err; u64 ns; err = pptp->info->getcrosscycles(pptp->info, xtstamp); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device)); mutex_unlock(&vclock->lock); xtstamp->device = ns_to_ktime(ns); return 0; } static long ptp_vclock_refresh(struct ptp_clock_info *ptp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct timespec64 ts; ptp_vclock_gettime(&vclock->info, &ts); return PTP_VCLOCK_REFRESH_INTERVAL; } static void ptp_vclock_set_subclass(struct ptp_clock *ptp) { lockdep_set_subclass(&ptp->clock.rwsem, PTP_LOCK_VIRTUAL); } static const struct ptp_clock_info ptp_vclock_info = { .owner = THIS_MODULE, .name = "ptp virtual clock", .max_adj = 500000000, .adjfine = ptp_vclock_adjfine, .adjtime = ptp_vclock_adjtime, .settime64 = ptp_vclock_settime, .do_aux_work = ptp_vclock_refresh, }; static u64 ptp_vclock_read(struct cyclecounter *cc) { struct ptp_vclock *vclock = cc_to_vclock(cc); struct ptp_clock *ptp = vclock->pclock; struct timespec64 ts = {}; ptp->info->getcycles64(ptp->info, &ts); return timespec64_to_ns(&ts); } static const struct cyclecounter ptp_vclock_cc = { .read = ptp_vclock_read, .mask = CYCLECOUNTER_MASK(32), .mult = PTP_VCLOCK_CC_MULT, .shift = PTP_VCLOCK_CC_SHIFT, }; struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) { struct ptp_vclock *vclock; vclock = kzalloc(sizeof(*vclock), GFP_KERNEL); if (!vclock) return NULL; vclock->pclock = pclock; vclock->info = ptp_vclock_info; if (pclock->info->getcyclesx64) vclock->info.gettimex64 = ptp_vclock_gettimex; else vclock->info.gettime64 = ptp_vclock_gettime; if (pclock->info->getcrosscycles) vclock->info.getcrosststamp = ptp_vclock_getcrosststamp; vclock->cc = ptp_vclock_cc; snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt", pclock->index); INIT_HLIST_NODE(&vclock->vclock_hash_node); mutex_init(&vclock->lock); vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); if (IS_ERR_OR_NULL(vclock->clock)) { kfree(vclock); return NULL; } ptp_vclock_set_subclass(vclock->clock); timecounter_init(&vclock->tc, &vclock->cc, 0); ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); ptp_vclock_hash_add(vclock); return vclock; } void ptp_vclock_unregister(struct ptp_vclock *vclock) { ptp_vclock_hash_del(vclock); ptp_clock_unregister(vclock->clock); kfree(vclock); } #if IS_BUILTIN(CONFIG_PTP_1588_CLOCK) int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { char name[PTP_CLOCK_NAME_LEN] = ""; struct ptp_clock *ptp; struct device *dev; int num = 0; if (pclock_index < 0) return num; snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index); dev = class_find_device_by_name(&ptp_class, name); if (!dev) return num; ptp = dev_get_drvdata(dev); if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) { put_device(dev); return num; } *vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL); if (!(*vclock_index)) goto out; memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks); num = ptp->n_vclocks; out: mutex_unlock(&ptp->n_vclocks_mux); put_device(dev); return num; } EXPORT_SYMBOL(ptp_get_vclocks_index); ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { unsigned int hash = vclock_index % HASH_SIZE(vclock_hash); struct ptp_vclock *vclock; u64 ns; u64 vclock_ns = 0; ns = ktime_to_ns(*hwtstamp); rcu_read_lock(); hlist_for_each_entry_rcu(vclock, &vclock_hash[hash], vclock_hash_node) { if (vclock->clock->index != vclock_index) continue; if (mutex_lock_interruptible(&vclock->lock)) break; vclock_ns = timecounter_cyc2time(&vclock->tc, ns); mutex_unlock(&vclock->lock); break; } rcu_read_unlock(); return ns_to_ktime(vclock_ns); } EXPORT_SYMBOL(ptp_convert_timestamp); #endif
6 1 1 3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; const struct nf_conn_acct *acct; const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; acct = nf_conn_acct_find(ct); if (!acct) return false; counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } break; case XT_CONNBYTES_BYTES: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; } break; case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } if (pkts != 0) what = div64_u64(bytes, pkts); break; } if (sinfo->count.to >= sinfo->count.from) return what <= sinfo->count.to && what >= sinfo->count.from; else /* inverted */ return what < sinfo->count.to || what > sinfo->count.from; } static int connbytes_mt_check(const struct xt_mtchk_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; int ret; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && sinfo->what != XT_CONNBYTES_AVGPKT) return -EINVAL; if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && sinfo->direction != XT_CONNBYTES_DIR_REPLY && sinfo->direction != XT_CONNBYTES_DIR_BOTH) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } /* * This filter cannot function correctly unless connection tracking * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } return ret; } static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match connbytes_mt_reg __read_mostly = { .name = "connbytes", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE, }; static int __init connbytes_mt_init(void) { return xt_register_match(&connbytes_mt_reg); } static void __exit connbytes_mt_exit(void) { xt_unregister_match(&connbytes_mt_reg); } module_init(connbytes_mt_init); module_exit(connbytes_mt_exit);
1 529 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking helpers. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_helper.h */ #ifndef _NF_CONNTRACK_HELPER_H #define _NF_CONNTRACK_HELPER_H #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_expect.h> #define NF_NAT_HELPER_PREFIX "ip_nat_" #define NF_NAT_HELPER_NAME(name) NF_NAT_HELPER_PREFIX name #define MODULE_ALIAS_NF_NAT_HELPER(name) \ MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) struct module; enum nf_ct_helper_flags { NF_CT_HELPER_F_USERSPACE = (1 << 0), NF_CT_HELPER_F_CONFIGURED = (1 << 1), }; #define NF_CT_HELPER_NAME_LEN 16 struct nf_conntrack_helper { struct hlist_node hnode; /* Internal use. */ char name[NF_CT_HELPER_NAME_LEN]; /* name of the module */ refcount_t refcnt; struct module *me; /* pointer to self */ const struct nf_conntrack_expect_policy *expect_policy; /* Tuple of things we will help (compared against server response) */ struct nf_conntrack_tuple tuple; /* Function to call when data passes; return verdict, or -1 to invalidate. */ int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info conntrackinfo); void (*destroy)(struct nf_conn *ct); int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct); int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct); unsigned int expect_class_max; unsigned int flags; /* For user-space helpers: */ unsigned int queue_num; /* length of userspace private data stored in nf_conn_help->data */ u16 data_len; /* name of NAT helper module */ char nat_mod_name[NF_CT_HELPER_NAME_LEN]; }; /* Must be kept in sync with the classes defined by helpers */ #define NF_CT_MAX_EXPECT_CLASSES 4 /* nf_conn feature for connections that have a helper */ struct nf_conn_help { /* Helper. if any */ struct nf_conntrack_helper __rcu *helper; struct hlist_head expectations; /* Current number of expected connections */ u8 expecting[NF_CT_MAX_EXPECT_CLASSES]; /* private helper information. */ char data[32] __aligned(8); }; #define NF_CT_HELPER_BUILD_BUG_ON(structsize) \ BUILD_BUG_ON((structsize) > sizeof_field(struct nf_conn_help, data)) struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum); struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper); void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct), struct module *module); int nf_conntrack_helper_register(struct nf_conntrack_helper *); void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, unsigned int); struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto); int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, u8 proto, bool nat, struct nf_conntrack_helper **hp); void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) { return nf_ct_ext_find(ct, NF_CT_EXT_HELPER); } static inline void *nfct_help_data(const struct nf_conn *ct) { struct nf_conn_help *help; help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); return (void *)help->data; } int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); int nf_conntrack_broadcast_help(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout); struct nf_ct_helper_expectfn { struct list_head head; const char *name; void (*expectfn)(struct nf_conn *ct, struct nf_conntrack_expect *exp); }; __printf(3,4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n); struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_name(const char *name); struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_symbol(const void *symbol); extern struct hlist_head *nf_ct_helper_hash; extern unsigned int nf_ct_helper_hsize; struct nf_conntrack_nat_helper { struct list_head list; char mod_name[NF_CT_HELPER_NAME_LEN]; /* module name */ struct module *module; /* pointer to self */ }; #define NF_CT_NAT_HELPER_INIT(name) \ { \ .mod_name = NF_NAT_HELPER_NAME(name), \ .module = THIS_MODULE \ } void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat); void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); #endif /*_NF_CONNTRACK_HELPER_H*/
14 337 337 337 511 248 484 512 470 46 52 51 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 // SPDX-License-Identifier: GPL-2.0-only /* * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. * * 10Apr2002 Andrew Morton * Initial version */ #include <linux/kernel.h> #include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/init.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/mpage.h> #include <linux/rmap.h> #include <linux/percpu.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> #include <linux/mm_inline.h> #include <linux/shmem_fs.h> #include <trace/events/writeback.h> #include "internal.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ #define MAX_PAUSE max(HZ/5, 1) /* * Try to keep balance_dirty_pages() call intervals higher than this many pages * by raising pause time to max_pause when falls below it. */ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) #define RATELIMIT_CALC_SHIFT 10 /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ static long ratelimit_pages = 32; /* The following parameters are exported via /proc/sys/vm */ /* * Start background writeback (via writeback threads) at this percentage */ static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ static int vm_dirty_ratio = 20; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ EXPORT_SYMBOL_GPL(dirty_writeback_interval); /* * The longest time for which data is allowed to remain dirty */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; /* * Length of period for aging writeout fractions of bdis. This is an * arbitrarily chosen number. The longer the period, the slower fractions will * reflect changes in current writeout rate. */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) #ifdef CONFIG_CGROUP_WRITEBACK #define GDTC_INIT(__wb) .wb = (__wb), \ .dom = &global_wb_domain, \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB .dom = &global_wb_domain #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ .dom = mem_cgroup_wb_domain(__wb), \ .wb_completions = &(__wb)->memcg_completions, \ .gdtc = __gdtc static bool mdtc_valid(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return dtc->dom; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return mdtc->gdtc; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return &wb->memcg_completions; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; /* * @wb may already be clean by the time control reaches here and * the total may not include its bw. */ if (this_bw < tot_bw) { if (min) { min *= this_bw; min = div64_ul(min, tot_bw); } if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } } *minp = min; *maxp = max; } #else /* CONFIG_CGROUP_WRITEBACK */ #define GDTC_INIT(__wb) .wb = (__wb), \ .wb_completions = &(__wb)->completions #define GDTC_INIT_NO_WB #define MDTC_INIT(__wb, __gdtc) static bool mdtc_valid(struct dirty_throttle_control *dtc) { return false; } static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) { return &global_wb_domain; } static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) { return NULL; } static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) { return NULL; } static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { *minp = wb->bdi->min_ratio; *maxp = wb->bdi->max_ratio; } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect * lowmem and the ability to uphold the zone's watermarks without * requiring writeback. * * This number of dirtyable pages is the base value of which the * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * * Because the user is allowed to specify the dirty limit globally as * absolute number of bytes, calculating the per-zone dirty limit can * require translating the configured limit into a percentage of * global dirtyable memory first. */ /** * node_dirtyable_memory - number of dirtyable pages in a node * @pgdat: the node * * Return: the node's number of pages potentially available for dirty * page cache. This is the base value for the per-node dirty limits. */ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { unsigned long nr_pages = 0; int z; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; nr_pages += zone_page_state(zone, NR_FREE_PAGES); } /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ nr_pages -= min(nr_pages, pgdat->totalreserve_pages); nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; int i; for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { struct zone *z; unsigned long nr_pages; if (!is_highmem_idx(i)) continue; z = &NODE_DATA(node)->node_zones[i]; if (!populated_zone(z)) continue; nr_pages = zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ nr_pages -= min(nr_pages, high_wmark_pages(z)); nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); x += nr_pages; } } /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure * that this does not occur. */ return min(x, total); #else return 0; #endif } /** * global_dirtyable_memory - number of globally dirtyable pages * * Return: the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ static unsigned long global_dirtyable_memory(void) { unsigned long x; x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ x -= min(x, totalreserve_pages); x += global_node_page_state(NR_INACTIVE_FILE); x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); return x + 1; /* Ensure that we never return 0 */ } /** * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain * @dtc: dirty_throttle_control of interest * * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { const unsigned long available_memory = dtc->avail; struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; /* convert ratios to per-PAGE_SIZE for higher precision */ unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; /* gdtc is !NULL iff @dtc is for memcg domain */ if (gdtc) { unsigned long global_avail = gdtc->avail; /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against * globally available memory. As the ratios are in * per-PAGE_SIZE, they can be obtained by dividing bytes by * number of pages. */ if (bytes) ratio = min(DIV_ROUND_UP(bytes, global_avail), PAGE_SIZE); if (bg_bytes) bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; tsk = current; if (rt_or_dl_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } /* * Dirty throttling logic assumes the limits in page units fit into * 32-bits. This gives 16TB dirty limits max which is hopefully enough. */ if (thresh > UINT_MAX) thresh = UINT_MAX; /* This makes sure bg_thresh is within 32-bits as well */ if (bg_thresh >= thresh) bg_thresh = thresh / 2; dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; /* we should eventually report the domain in the TP */ if (!gdtc) trace_global_dirty_state(bg_thresh, thresh); } /** * global_dirty_limits - background-writeback and dirty-throttling thresholds * @pbackground: out parameter for bg_thresh * @pdirty: out parameter for thresh * * Calculate bg_thresh and thresh for global_wb_domain. See * domain_dirty_limits() for details. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; gdtc.avail = global_dirtyable_memory(); domain_dirty_limits(&gdtc); *pbackground = gdtc.bg_thresh; *pdirty = gdtc.thresh; } /** * node_dirty_limit - maximum number of dirty pages allowed in a node * @pgdat: the node * * Return: the maximum number of dirty pages allowed in a node, based * on the node's dirtyable memory. */ static unsigned long node_dirty_limit(struct pglist_data *pgdat) { unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * node_memory / global_dirtyable_memory(); else dirty = vm_dirty_ratio * node_memory / 100; if (rt_or_dl_task(tsk)) dirty += dirty / 4; /* * Dirty throttling logic assumes the limits in page units fit into * 32-bits. This gives 16TB dirty limits max which is hopefully enough. */ return min_t(unsigned long, dirty, UINT_MAX); } /** * node_dirty_ok - tells whether a node is within its dirty limits * @pgdat: the node to check * * Return: %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ bool node_dirty_ok(struct pglist_data *pgdat) { unsigned long limit = node_dirty_limit(pgdat); unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } #ifdef CONFIG_SYSCTL static int dirty_background_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } static int dirty_background_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) { if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > UINT_MAX) { dirty_background_bytes = old_bytes; return -ERANGE; } dirty_background_ratio = 0; } return ret; } static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { vm_dirty_bytes = 0; writeback_set_ratelimit(); } return ret; } static int dirty_bytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { vm_dirty_bytes = old_bytes; return -ERANGE; } writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } #endif static unsigned long wp_next_time(unsigned long cur_time) { cur_time += VM_COMPLETIONS_PERIOD_LEN; /* 0 has a special meaning... */ if (!cur_time) return 1; return cur_time; } static void wb_domain_writeout_add(struct wb_domain *dom, struct fprop_local_percpu *completions, unsigned int max_prop_frac, long nr) { __fprop_add_percpu_max(&dom->completions, completions, max_prop_frac, nr); /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ dom->period_time = wp_next_time(jiffies); mod_timer(&dom->period_timer, dom->period_time); } } /* * Increment @wb's writeout completion count and the global writeout * completion count. Called from __folio_end_writeback(). */ static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) { struct wb_domain *cgdom; wb_stat_mod(wb, WB_WRITTEN, nr); wb_domain_writeout_add(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac, nr); cgdom = mem_cgroup_wb_domain(wb); if (cgdom) wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), wb->bdi->max_prop_frac, nr); } void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); __wb_writeout_add(wb, 1); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ static void writeout_period(struct timer_list *t) { struct wb_domain *dom = timer_container_of(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; if (fprop_new_period(&dom->completions, miss_periods + 1)) { dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ dom->period_time = 0; } } int wb_domain_init(struct wb_domain *dom, gfp_t gfp) { memset(dom, 0, sizeof(*dom)); spin_lock_init(&dom->lock); timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; return fprop_global_init(&dom->completions, gfp); } #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom) { timer_delete_sync(&dom->period_timer); fprop_global_destroy(&dom->completions); } #endif /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. */ static unsigned int bdi_min_ratio; static int bdi_check_pages_limit(unsigned long pages) { unsigned long max_dirty_pages = global_dirtyable_memory(); if (pages > max_dirty_pages) return -EINVAL; return 0; } static unsigned long bdi_ratio_from_pages(unsigned long pages) { unsigned long background_thresh; unsigned long dirty_thresh; unsigned long ratio; global_dirty_limits(&background_thresh, &dirty_thresh); if (!dirty_thresh) return -EINVAL; ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); return ratio; } static u64 bdi_get_bytes(unsigned int ratio) { unsigned long background_thresh; unsigned long dirty_thresh; u64 bytes; global_dirty_limits(&background_thresh, &dirty_thresh); bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; return bytes; } static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; if (min_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { if (min_ratio < bdi->min_ratio) { delta = bdi->min_ratio - min_ratio; bdi_min_ratio -= delta; bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { ret = -EINVAL; } } } spin_unlock_bh(&bdi_lock); return ret; } static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / (100 * BDI_RATIO_SCALE); } spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio); } int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); } EXPORT_SYMBOL(bdi_set_max_ratio); u64 bdi_get_min_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->min_ratio); } int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) { int ret; unsigned long pages = min_bytes >> PAGE_SHIFT; long min_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; min_ratio = bdi_ratio_from_pages(pages); if (min_ratio < 0) return min_ratio; return __bdi_set_min_ratio(bdi, min_ratio); } u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); } int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) { int ret; unsigned long pages = max_bytes >> PAGE_SHIFT; long max_ratio; ret = bdi_check_pages_limit(pages); if (ret) return ret; max_ratio = bdi_ratio_from_pages(pages); if (max_ratio < 0) return max_ratio; return __bdi_set_max_ratio(bdi, max_ratio); } int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) return -EINVAL; spin_lock_bh(&bdi_lock); if (strict_limit) bdi->capabilities |= BDI_CAP_STRICTLIMIT; else bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; spin_unlock_bh(&bdi_lock); return 0; } static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { return (thresh + bg_thresh) / 2; } static unsigned long hard_dirty_limit(struct wb_domain *dom, unsigned long thresh) { return max(thresh, dom->dirty_limit); } /* * Memory which can be further allocated to a memcg domain is capped by * system-wide clean memory excluding the amount being used in the domain. */ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, unsigned long filepages, unsigned long headroom) { struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); unsigned long clean = filepages - min(filepages, mdtc->dirty); unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); unsigned long other_clean = global_clean - min(global_clean, clean); mdtc->avail = filepages + min(headroom, other_clean); } static inline bool dtc_is_global(struct dirty_throttle_control *dtc) { return mdtc_gdtc(dtc) == NULL; } /* * Dirty background will ignore pages being written as we're trying to * decide whether to put more under writeback. */ static void domain_dirty_avail(struct dirty_throttle_control *dtc, bool include_writeback) { if (dtc_is_global(dtc)) { dtc->avail = global_dirtyable_memory(); dtc->dirty = global_node_page_state(NR_FILE_DIRTY); if (include_writeback) dtc->dirty += global_node_page_state(NR_WRITEBACK); } else { unsigned long filepages = 0, headroom = 0, writeback = 0; mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, &writeback); if (include_writeback) dtc->dirty += writeback; mdtc_calc_avail(dtc, filepages, headroom); } } /** * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc * * Note that balance_dirty_pages() will only seriously take dirty throttling * threshold as a hard limit when sleeping max_pause per page is not enough * to keep the dirty pages under control. For example, when the device is * completely stalled due to some error conditions, or when there are 1000 * dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. For dirty throttling limit, the term * "dirty" in the context of dirty balancing includes all PG_dirty and * PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); struct bdi_writeback *wb = dtc->wb; u64 wb_thresh; u64 wb_max_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* * Calculate this wb's share of the thresh ratio. */ fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); /* * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ if (thresh > dtc->dirty) { if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); else wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); } wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); if (wb_thresh > wb_max_thresh) wb_thresh = wb_max_thresh; return wb_thresh; } unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; domain_dirty_avail(&gdtc, true); domain_dirty_avail(&mdtc, true); domain_dirty_limits(&mdtc); return __wb_calc_thresh(&mdtc, mdtc.thresh); } /* * setpoint - dirty 3 * f(dirty) := 1.0 + (----------------) * limit - setpoint * * it's a 3rd order polynomial that subjects to * * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast * (2) f(setpoint) = 1.0 => the balance point * (3) f(limit) = 0 => the hard limit * (4) df/dx <= 0 => negative feedback control * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, (limit - setpoint) | 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; pos_ratio += 1 << RATELIMIT_CALC_SHIFT; return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); } /* * Dirty position control. * * (o) global/bdi setpoints * * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. * * pos_ratio = 1 << RATELIMIT_CALC_SHIFT * * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * * if (wb_dirty < wb_setpoint) scale up pos_ratio * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * * (o) global control line * * ^ pos_ratio * | * | |<===== global dirty control scope ======>| * 2.0 * * * * * * * * | .* * | . * * | . * * | . * * | . * * | . * * 1.0 ................................* * | . . * * | . . * * | . . * * | . . * * | . . * * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * * (o) wb control line * * ^ pos_ratio * | * | * * | * * | * * | * * | * |<=========== span ============>| * 1.0 .......................* * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * | . * * 1/4 ...............................................* * * * * * * * * * * * * | . . * | . . * | . . * 0 +----------------------.-------------------------------.-------------> * wb_setpoint^ x_intercept^ * * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD * card's wb_dirty may rush to many times higher than wb_setpoint. * - the wb dirty thresh drops quickly due to change of JBOD workload */ static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; dtc->pos_ratio = 0; if (unlikely(dtc->dirty >= limit)) return; /* * global setpoint * * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For * such filesystems balance_dirty_pages always checks wb counters * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. * * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; if (dtc->wb_dirty >= wb_thresh) return; wb_setpoint = dirty_freerun_ceiling(wb_thresh, dtc->wb_bg_thresh); if (wb_setpoint == 0 || wb_setpoint == wb_thresh) return; wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, wb_thresh); /* * Typically, for strictlimit case, wb_setpoint << setpoint * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other * wb's) while given strictlimit wb is below limit. * * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); return; } /* * We have computed basic pos_ratio above based on global situation. If * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* * wb setpoint * * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * * x_intercept - wb_dirty * := -------------------------- * x_intercept - wb_setpoint * * The main wb control line is a linear function that subjects to * * (1) f(wb_setpoint) = 1.0 * (2) k = - 1 / (8 * write_bw) (in single wb case) * or equally: x_intercept = wb_setpoint + 8 * write_bw * * For single wb case, the dirty pages are observed to fluctuate * regularly within range * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); wb_setpoint = setpoint * (u64)x >> 16; /* * Use span=(8*write_bw) in single wb case as indicated by * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * * wb_thresh thresh - wb_thresh * span = --------- * (8 * write_bw) + ------------------ * wb_thresh * thresh thresh */ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; x_intercept = wb_setpoint + span; if (dtc->wb_dirty < x_intercept - span / 4) { pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept, dtc->wb_dirty); else pos_ratio *= 8; } dtc->pos_ratio = pos_ratio; } static void wb_update_write_bandwidth(struct bdi_writeback *wb, unsigned long elapsed, unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); unsigned long avg = wb->avg_write_bandwidth; unsigned long old = wb->write_bandwidth; u64 bw; /* * bw = written * HZ / elapsed * * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period * * @written may have decreased due to folio_redirty_for_writepage(). * Avoid underflowing @bw calculation. */ bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { bw = div64_ul(bw, elapsed); avg = bw; goto out; } bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* * one more level of smoothing, for filtering out sudden spikes */ if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3; out: /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ avg = max(avg, 1LU); if (wb_has_dirty_io(wb)) { long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta, &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) { struct wb_domain *dom = dtc_dom(dtc); unsigned long thresh = dtc->thresh; unsigned long limit = dom->dirty_limit; /* * Follow up in one step. */ if (limit < thresh) { limit = thresh; goto update; } /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: dom->dirty_limit = limit; } static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; spin_lock(&dom->lock); if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { update_dirty_limit(dtc); dom->dirty_limit_tstamp = now; } spin_unlock(&dom->lock); } /* * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long dirtied, unsigned long elapsed) { struct bdi_writeback *wb = dtc->wb; unsigned long dirty = dtc->dirty; unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; unsigned long write_bw = wb->avg_write_bandwidth; unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * * Note that the expanded form is not a pure rate feedback: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) * but also takes pos_ratio into account: * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) * * (1) is not realistic because pos_ratio also takes part in balancing * the dirty rate. Consider the state * pos_ratio = 0.5 (3) * rate = 2 * (write_bw / N) (4) * If (1) is used, it will stuck in that state! Because each dd will * be throttled at * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) * yielding * dirty_rate = N * task_ratelimit = write_bw (6) * put (6) into (1) we get * rate_(i+1) = rate_(i) (7) * * So we end up using (2) to always keep * rate_(i+1) ~= (write_bw / N) (8) * regardless of the value of pos_ratio. As long as (8) is satisfied, * pos_ratio is able to drive itself to 1.0, which is not only where * the dirty count meet the setpoint, but also where the slope of * pos_ratio is most flat and hence task_ratelimit is least fluctuated. */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); /* * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw */ if (unlikely(balanced_dirty_ratelimit > write_bw)) balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: * * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of * * task_ratelimit - dirty_ratelimit * = (pos_ratio - 1) * dirty_ratelimit * * which reflects the direction and size of dirty position error. */ /* * dirty_ratelimit will follow balanced_dirty_ratelimit iff * task_ratelimit is on the same side of dirty_ratelimit, too. * For example, when * - dirty_ratelimit > balanced_dirty_ratelimit * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) * lowering dirty_ratelimit will help meet both the position and rate * control targets. Otherwise, don't update dirty_ratelimit if it will * only help meet the rate target. After all, what the users ultimately * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; /* * For strictlimit case, calculations above were based on wb counters * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } /* * Don't pursue 100% rate matching. It's impossible since the balanced * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ shift = dirty_ratelimit / (2 * step + 1); if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8); else step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; else dirty_ratelimit -= step; WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; unsigned long elapsed; unsigned long dirtied; unsigned long written; spin_lock(&wb->list_lock); /* * Lockless checks for elapsed time are racy and delayed update after * IO completion doesn't do it at all (to make sure written pages are * accounted reasonably quickly). Make sure elapsed >= 1 to avoid * division errors. */ elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* * @mdtc is always NULL if !CGROUP_WRITEBACK but the * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } wb_update_write_bandwidth(wb, elapsed, written); wb->dirtied_stamp = dirtied; wb->written_stamp = written; WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; __wb_update_bandwidth(&gdtc, NULL, false); } /* Interval after which we consider wb idle and don't estimate bandwidth */ #define WB_BANDWIDTH_IDLE_JIF (HZ) static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) { unsigned long now = jiffies; unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); if (elapsed > WB_BANDWIDTH_IDLE_JIF && !atomic_read(&wb->writeback_inodes)) { spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } /* * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh) { if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1); return 1; } static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long * time, a small pool of dirty/writeback pages may go empty and disk go * idle. * * 8 serves as the safety ratio. */ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } static long wb_min_pause(struct bdi_writeback *wb, long max_pause, unsigned long task_ratelimit, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ /* target for 10ms pause on 1-dd case */ t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) t += (hi - lo) * (10 * HZ) / 1024; /* * This is a bit convoluted. We try to base the next nr_dirtied_pause * on the much more stable dirty_ratelimit. However the next pause time * will be computed based on task_ratelimit and the two rate limits may * depart considerably at some time. Especially if task_ratelimit goes * below dirty_ratelimit/2 and the target pause is max_pause, the next * pause time will be max_pause*2 _trimmed down_ to max_pause. As a * result task_ratelimit won't be executed faithfully, which could * eventually bring down dirty_ratelimit. * * We apply two rules to fix it up: * 1) try to estimate the next pause time and if necessary, use a lower * nr_dirtied_pause so as not to exceed max_pause. When this happens, * nr_dirtied_pause will be "dancing" with task_ratelimit. * 2) limit the target pause time to max_pause/2, so that the normal * small fluctuations of task_ratelimit won't trigger rule (1) and * nr_dirtied_pause will remain as stable as dirty_ratelimit. */ t = min(t, 1 + max_pause / 2); pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); /* * Tiny nr_dirtied_pause is found to hurt I/O performance in the test * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. * When the 16 consecutive reads are often interrupted by some dirty * throttling pause during the async writes, cfq will go into idles * (deadline is fine). So push nr_dirtied_pause as high as possible * until reaches DIRTY_POLL_THRESH=32 pages. */ if (pages < DIRTY_POLL_THRESH) { t = max_pause; pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); if (pages > DIRTY_POLL_THRESH) { pages = DIRTY_POLL_THRESH; t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; } } pause = HZ * pages / (task_ratelimit + 1); if (pause > max_pause) { t = max_pause; pages = task_ratelimit * t / roundup_pow_of_two(HZ); } *nr_dirtied_pause = pages; /* * The minimal pause time will normally be half the target pause time. */ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; unsigned long wb_reclaimable; /* * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow * go into state (wb_dirty >> wb_thresh) either because * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until wb_dirty drops under * wb_thresh. Instead the auxiliary wb control line in * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, bool strictlimit) { unsigned long dirty, thresh; if (strictlimit) { dirty = dtc->wb_dirty; thresh = dtc->wb_thresh; } else { dirty = dtc->dirty; thresh = dtc->thresh; } return dirty_poll_interval(dirty, thresh); } /* * Throttle it only when the background writeback cannot catch-up. This avoids * (excessively) small writeouts when the wb limits are ramping up in case of * !strictlimit. * * In strictlimit case make decision based on the wb counters and limits. Small * writeouts when the wb limits are ramping up are the price we consciously pay * for strictlimit-ing. */ static void domain_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { unsigned long dirty, thresh, bg_thresh; if (unlikely(strictlimit)) { wb_dirty_limits(dtc); dirty = dtc->wb_dirty; thresh = dtc->wb_thresh; bg_thresh = dtc->wb_bg_thresh; } else { dirty = dtc->dirty; thresh = dtc->thresh; bg_thresh = dtc->bg_thresh; } dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); } static void balance_domain_limits(struct dirty_throttle_control *dtc, bool strictlimit) { domain_dirty_avail(dtc, true); domain_dirty_limits(dtc); domain_dirty_freerun(dtc, strictlimit); } static void wb_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { dtc->freerun = false; /* was already handled in domain_dirty_freerun */ if (strictlimit) return; wb_dirty_limits(dtc); /* * LOCAL_THROTTLE tasks must not be throttled when below the per-wb * freerun ceiling. */ if (!(current->flags & PF_LOCAL_THROTTLE)) return; dtc->freerun = dtc->wb_dirty < dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); } static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, bool strictlimit) { dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && ((dtc->dirty > dtc->thresh) || strictlimit); } /* * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is * in freerun state. Please don't use these invalid fields in freerun case. */ static void balance_wb_limits(struct dirty_throttle_control *dtc, bool strictlimit) { wb_dirty_freerun(dtc, strictlimit); if (dtc->freerun) return; wb_dirty_exceeded(dtc, strictlimit); wb_position_ratio(dtc); } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; unsigned long nr_dirty; long period; long pause; long max_pause; long min_pause; int nr_dirtied_pause; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; int ret = 0; for (;;) { unsigned long now = jiffies; nr_dirty = global_node_page_state(NR_FILE_DIRTY); balance_domain_limits(gdtc, strictlimit); if (mdtc) { /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ balance_domain_limits(mdtc, strictlimit); } /* * In laptop mode, we wait until hitting the higher threshold * before starting background writeout, and then write out all * the way down to the lower threshold. So slow writers cause * minimal disk activity. * * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ if (!laptop_mode && nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ if (gdtc->freerun && (!mdtc || mdtc->freerun)) { unsigned long intv; unsigned long m_intv; free_running: intv = domain_poll_intv(gdtc, strictlimit); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) m_intv = domain_poll_intv(mdtc, strictlimit); current->nr_dirtied_pause = min(intv, m_intv); break; } /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); mem_cgroup_flush_foreign(wb); /* * Calculate global domain's pos_ratio and select the * global dtc by default. */ balance_wb_limits(gdtc, strictlimit); if (gdtc->freerun) goto free_running; sdtc = gdtc; if (mdtc) { /* * If memcg domain is in effect, calculate its * pos_ratio. @wb should satisfy constraints from * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ balance_wb_limits(mdtc, strictlimit); if (mdtc->freerun) goto free_running; if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } wb->dirty_exceeded = gdtc->dirty_exceeded || (mdtc && mdtc->dirty_exceeded); if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); min_pause = wb_min_pause(wb, max_pause, task_ratelimit, dirty_ratelimit, &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; pause = max_pause; goto pause; } period = HZ * pages_dirtied / task_ratelimit; pause = period; if (current->dirty_paused_when) pause -= now - current->dirty_paused_when; /* * For less than 1s think time (ext3/4 may block the dirtier * for up to 800ms from time to time on 1-HDD; so does xfs, * however at much less frequency), try to compensate it in * future periods by updating the virtual time; otherwise just * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { trace_balance_dirty_pages(wb, sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, period, min(pause, 0L), start_time); if (pause < -HZ) { current->dirty_paused_when = now; current->nr_dirtied = 0; } else if (period) { current->dirty_paused_when += period; current->nr_dirtied = 0; } else if (current->nr_dirtied_pause <= pages_dirtied) current->nr_dirtied_pause += pages_dirtied; break; } if (unlikely(pause > max_pause)) { /* for occasional dropped task_ratelimit */ now += min(pause - max_pause, max_pause); pause = max_pause; } pause: trace_balance_dirty_pages(wb, sdtc, dirty_ratelimit, task_ratelimit, pages_dirtied, period, pause, start_time); if (flags & BDP_ASYNC) { ret = -EAGAIN; break; } __set_current_state(TASK_KILLABLE); bdi->last_bdp_sleep = jiffies; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; current->nr_dirtied = 0; current->nr_dirtied_pause = nr_dirtied_pause; /* * This is typically equal to (dirty < thresh) and can also * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the consumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) break; } return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); /* * Normal tasks are throttled by * loop { * dirty tsk->nr_dirtied_pause pages; * take a snap in balance_dirty_pages(); * } * However there is a worst case. If every task exit immediately when dirtied * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be * called to throttle the page dirties. The solution is to save the not yet * throttled page dirties in dirty_throttle_leaks on task exit and charge them * randomly into the running tasks. This works well for the above worst case, * as the new task will pick up and accumulate the old task's leaked dirty * count and eventually get throttled. */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. * @mapping: address_space which was dirtied. * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * See balance_dirty_pages_ratelimited() for details. * * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to * indicate that memory is out of balance and the caller must wait * for I/O to complete. Otherwise, it will return 0 to indicate * that either memory was already in balance, or it was able to sleep * until the amount of dirty memory returned to balance. */ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; int ret = 0; int *p; if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); if (!wb) wb = &bdi->wb; ratelimit = current->nr_dirtied_pause; if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); /* * This prevents one CPU to accumulate too many dirtied pages without * calling into balance_dirty_pages(), which can happen when there are * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { *p = 0; ratelimit = 0; } /* * Pick up the dirtied pages by the exited tasks. This avoids lots of * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); return ret; } EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** * balance_dirty_pages_ratelimited - balance dirty memory state. * @mapping: address_space which was dirtied. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * Once we're over the dirty memory limit we decrease the ratelimiting * by a lot, to prevent individual processes from overshooting the limit * by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); /* * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty * and thresh, but it's for background writeback. */ static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); if (dtc->wb_bg_thresh < 2 * wb_stat_error()) dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); else dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); } static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) { domain_dirty_avail(dtc, false); domain_dirty_limits(dtc); if (dtc->dirty > dtc->bg_thresh) return true; wb_bg_dirty_limits(dtc); if (dtc->wb_dirty > dtc->wb_bg_thresh) return true; return false; } /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest * * Determines whether background writeback should keep writing @wb or it's * clean enough. * * Return: %true if writeback should continue. */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; if (domain_over_bg_thresh(&gdtc)) return true; if (mdtc_valid(&mdtc)) return domain_over_bg_thresh(&mdtc); return false; } #ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; ret = proc_dointvec(table, write, buffer, length, ppos); /* * Writing 0 to dirty_writeback_interval will disable periodic writeback * and a different non-zero value will wakeup the writeback threads. * wb_wakeup_delayed() would be more appropriate, but it's a pain to * iterate over all bdis and wbs. * The reason we do this is to make the change take effect immediately. */ if (!ret && write && dirty_writeback_interval && dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; } #endif void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* * We've spun up the disk and we're in laptop mode: schedule writeback * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ void laptop_io_completion(struct backing_dev_info *info) { mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* * We're in laptop mode and we've just synced. The sync's writes will have * caused another writeback to be scheduled by laptop_io_completion. * Nothing needs to be written back anymore, so we unschedule the writeback. */ void laptop_sync_completion(void) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) timer_delete(&bdi->laptop_mode_wb_timer); rcu_read_unlock(); } /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory * thresholds. */ void writeback_set_ratelimit(void) { struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } static int page_writeback_cpu_online(unsigned int cpu) { writeback_set_ratelimit(); return 0; } #ifdef CONFIG_SYSCTL /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #ifdef CONFIG_HIGHMEM { .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; #endif /* * Called early on to tune the page writeback dirty limits. * * We used to scale dirty pages according to how total memory * related to pages that could be allocated for buffers. * * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory, and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. * * But we might still want to scale the dirty_ratio by how * much memory the box has.. */ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); #ifdef CONFIG_SYSCTL register_sysctl_init("vm", vm_page_writeback_sysctls); #endif } /** * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags * all pages that have DIRTY tag set with a special TOWRITE tag. The caller * can then use the TOWRITE tag to identify pages eligible for writeback. * This mechanism is used to avoid livelocking of writeback by a process * steadily creating new dirty pages in the file (thus it is important for this * function to be quick so that it can tag pages faster than a dirtying process * can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); unsigned int tagged = 0; void *page; xas_lock_irq(&xas); xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); if (++tagged % XA_CHECK_SCHED) continue; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); static bool folio_prepare_writeback(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio) { /* * Folio truncated or invalidated. We can freely skip it then, * even for data integrity operations: the folio has disappeared * concurrently, so there could be no real expectation of this * data integrity operation even if there is now a new, dirty * folio at the same pagecache index. */ if (unlikely(folio->mapping != mapping)) return false; /* * Did somebody else write it for us? */ if (!folio_test_dirty(folio)) return false; if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) return false; folio_wait_writeback(folio); } BUG_ON(folio_test_writeback(folio)); if (!folio_clear_dirty_for_io(folio)) return false; return true; } static xa_mark_t wbc_to_tag(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) return PAGECACHE_TAG_TOWRITE; return PAGECACHE_TAG_DIRTY; } static pgoff_t wbc_end(struct writeback_control *wbc) { if (wbc->range_cyclic) return -1; return wbc->range_end >> PAGE_SHIFT; } static struct folio *writeback_get_folio(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio; retry: folio = folio_batch_next(&wbc->fbatch); if (!folio) { folio_batch_release(&wbc->fbatch); cond_resched(); filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), wbc_to_tag(wbc), &wbc->fbatch); folio = folio_batch_next(&wbc->fbatch); if (!folio) return NULL; } folio_lock(folio); if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { folio_unlock(folio); goto retry; } trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); return folio; } /** * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write * @wbc: writeback context * @folio: previously iterated folio (%NULL to start) * @error: in-out pointer for writeback errors (see below) * * This function returns the next folio for the writeback operation described by * @wbc on @mapping and should be called in a while loop in the ->writepages * implementation. * * To start the writeback operation, %NULL is passed in the @folio argument, and * for every subsequent iteration the folio returned previously should be passed * back in. * * If there was an error in the per-folio writeback inside the writeback_iter() * loop, @error should be set to the error value. * * Once the writeback described in @wbc has finished, this function will return * %NULL and if there was an error in any iteration restore it to @error. * * Note: callers should not manually break out of the loop using break or goto * but must keep calling writeback_iter() until it returns %NULL. * * Return: the folio to write or %NULL if the loop is done. */ struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error) { if (!folio) { folio_batch_init(&wbc->fbatch); wbc->saved_err = *error = 0; /* * For range cyclic writeback we remember where we stopped so * that we can continue where we stopped. * * For non-cyclic writeback we always start at the beginning of * the passed in range. */ if (wbc->range_cyclic) wbc->index = mapping->writeback_index; else wbc->index = wbc->range_start >> PAGE_SHIFT; /* * To avoid livelocks when other processes dirty new pages, we * first tag pages which should be written back and only then * start writing them. * * For data-integrity writeback we have to be careful so that we * do not miss some pages (e.g., because some other process has * cleared the TOWRITE tag we set). The rule we follow is that * TOWRITE tag can be cleared only by the process clearing the * DIRTY tag (and submitting the page for I/O). */ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, wbc->index, wbc_end(wbc)); } else { wbc->nr_to_write -= folio_nr_pages(folio); WARN_ON_ONCE(*error > 0); /* * For integrity writeback we have to keep going until we have * written all the folios we tagged for writeback above, even if * we run past wbc->nr_to_write or encounter errors. * We stash away the first error we encounter in wbc->saved_err * so that it can be retrieved when we're done. This is because * the file system may still have state to clear for each folio. * * For background writeback we exit as soon as we run past * wbc->nr_to_write or encounter the first error. */ if (wbc->sync_mode == WB_SYNC_ALL) { if (*error && !wbc->saved_err) wbc->saved_err = *error; } else { if (*error || wbc->nr_to_write <= 0) goto done; } } folio = writeback_get_folio(mapping, wbc); if (!folio) { /* * To avoid deadlocks between range_cyclic writeback and callers * that hold folios in writeback to aggregate I/O until * the writeback iteration finishes, we do not loop back to the * start of the file. Doing so causes a folio lock/folio * writeback access order inversion - we should only ever lock * multiple folios in ascending folio->index order, and looping * back to the start of the file violates that rule and causes * deadlocks. */ if (wbc->range_cyclic) mapping->writeback_index = 0; /* * Return the first error we encountered (if there was any) to * the caller. */ *error = wbc->saved_err; } return folio; done: if (wbc->range_cyclic) mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } EXPORT_SYMBOL_GPL(writeback_iter); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; wb = inode_to_wb_wbc(mapping->host, wbc); wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else /* deal with chardevs and other special files */ ret = 0; if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL) break; /* * Lacking an allocation context or the locality or writeback * state of any of the inode's pages, throttle based on * writeback activity on the local node. It's as good a * guess as any. */ reclaim_throttle(NODE_DATA(numa_node_id()), VMSCAN_THROTTLE_WRITEBACK); } /* * Usually few pages are written by now from those we've just submitted * but if there's constant writeback being submitted, this makes sure * writeback bandwidth is updated once in a while. */ if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) wb_update_bandwidth(wb); return ret; } /* * For address_spaces which do not use buffers nor write back. */ bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return !folio_test_set_dirty(folio); return false; } EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, struct address_space *mapping) { struct inode *inode = mapping->host; trace_writeback_dirty_folio(folio, mapping); if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; long nr = folio_nr_pages(folio); inode_attach_wb(inode, folio); wb = inode_to_wb(inode); __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); wb_stat_mod(wb, WB_DIRTIED, nr); task_io_account_write(nr * PAGE_SIZE); current->nr_dirtied += nr; __this_cpu_add(bdp_ratelimits, nr); mem_cgroup_track_foreign_dirty(folio, wb); } } /* * Helper function for deaccounting dirty page without writeback. * */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); task_io_account_cancelled_write(nr * PAGE_SIZE); } /* * Mark the folio dirty, and set it dirty in the page cache. * * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * * It is the caller's responsibility to prevent the folio from being truncated * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. * zap_vma_pages() has it mapped and is holding the page table lock). * When called from mark_buffer_dirty(), the filesystem should hold a * reference to the buffer_head that is being marked dirty, which causes * try_to_free_buffers() to fail. */ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, int warn) { unsigned long flags; /* * Shmem writeback relies on swap, and swap writeback is LRU based, * not using the dirty mark. */ VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); __xa_set_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } /** * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads. * @mapping: Address space this folio belongs to. * @folio: Folio to be marked as dirty. * * Filesystems which do not use buffer heads should call this function * from their dirty_folio address space operation. It ignores the * contents of folio_get_private(), so if the filesystem marks individual * blocks as dirty, the filesystem should handle that itself. * * This is also sometimes used by filesystems which use buffer_heads when * a single buffer is being dirtied: we want to set the folio dirty in * that case, but not all the buffers. This is a "bottom-up" dirtying, * whereas block_dirty_folio() is a "top-down" dirtying. * * The caller must ensure this doesn't race with truncation. Most will * simply hold the folio lock, but e.g. zap_pte_range() calls with the * folio mapped and the pte lock held, which also locks out truncation. */ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { if (folio_test_set_dirty(folio)) return false; __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return true; } EXPORT_SYMBOL(filemap_dirty_folio); /** * folio_redirty_for_writepage - Decline to write a dirty folio. * @wbc: The writeback control. * @folio: The folio. * * When a writepage implementation decides that it doesn't want to write * @folio for some reason, it should call this function, unlock @folio and * return 0. * * Return: True if we redirtied the folio. False if someone else dirtied * it first. */ bool folio_redirty_for_writepage(struct writeback_control *wbc, struct folio *folio) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); bool ret; wbc->pages_skipped += nr; ret = filemap_dirty_folio(mapping, folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied -= nr; node_stat_mod_folio(folio, NR_DIRTIED, -nr); wb_stat_mod(wb, WB_DIRTIED, -nr); unlocked_inode_to_wb_end(inode, &cookie); } return ret; } EXPORT_SYMBOL(folio_redirty_for_writepage); /** * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * * The folio may not be truncated while this function is running. * Holding the folio lock is sufficient to prevent truncation, but some * callers cannot acquire a sleeping lock. These callers instead hold * the page table lock for a page table which contains at least one page * in this folio. Truncation will block on the page table lock as it * unmaps pages before removing the folio from its mapping. * * Return: True if the folio was newly dirtied, false if it was already dirty. */ bool folio_mark_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (likely(mapping)) { /* * readahead/folio_deactivate could remain * PG_readahead/PG_reclaim due to race with folio_end_writeback * About readahead, if the folio is written, the flags would be * reset. So no problem. * About folio_deactivate, if the folio is redirtied, * the flag will be reset. So no problem. but if the * folio is used by readahead it will confuse readahead * and make it restart the size rampup process. But it's * a trivial problem. */ if (folio_test_reclaim(folio)) folio_clear_reclaim(folio); return mapping->a_ops->dirty_folio(mapping, folio); } return noop_dirty_folio(mapping, folio); } EXPORT_SYMBOL(folio_mark_dirty); /* * folio_mark_dirty() is racy if the caller has no reference against * folio->mapping->host, and if the folio is unlocked. This is because another * CPU could truncate the folio off the mapping and then free the mapping. * * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * * In other cases, the folio should be locked before running folio_mark_dirty(). */ bool folio_mark_dirty_lock(struct folio *folio) { bool ret; folio_lock(folio); ret = folio_mark_dirty(folio); folio_unlock(folio); return ret; } EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT * actually remove dirty bits on any mmap's that may be around. It also * leaves the page tagged dirty, so any sync activity will still find it on * the dirty lists, and in particular, clear_page_dirty_for_io() will still * look at the dirty bits in the VM. * * Doing this should *normally* only ever be done when a page is truncated, * and is not actually mapped anywhere at all. However, fs/buffer.c does * this when it notices that somebody has cleaned out all the buffers on a * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ void __folio_cancel_dirty(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); } else { folio_clear_dirty(folio); } } EXPORT_SYMBOL(__folio_cancel_dirty); /* * Clear a folio's dirty flag, while caring for dirty memory accounting. * Returns true if the folio was previously dirty. * * This is for preparing to put the folio under writeout. We leave * the folio tagged as dirty in the xarray so that a concurrent * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. * The ->writepage implementation will run either folio_start_writeback() * or folio_mark_dirty(), at which stage we bring the folio's dirty flag * and xarray dirty tag back into sync. * * This incoherency between the folio's dirty flag and xarray tag is * unfortunate, but it only exists while the folio is locked. */ bool folio_clear_dirty_for_io(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); bool ret = false; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. * * We use this sequence to make sure that * (a) we account for dirty stats properly * (b) we tell the low-level filesystem to * mark the whole folio dirty if it was * dirty in a pagetable. Only to then * (c) clean the folio again and return 1 to * cause the writeback. * * This way we avoid all nasty races with the * dirty bit in multiple places and clearing * them concurrently from different threads. * * Note! Normally the "folio_mark_dirty(folio)" * has no effect on the actual dirty bit - since * that will already usually be set. But we * need the side effects, and it can help us * avoid races. * * We basically use the folio "master dirty bit" * as a serialization point for all the different * threads doing their things. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * We carefully synchronise fault handlers against * installing a dirty pte and marking the folio dirty * at this point. We do this by having them hold the * page lock while dirtying the folio, and folios are * always locked coming in here, so we get the desired * exclusion. */ wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); ret = true; } unlocked_inode_to_wb_end(inode, &cookie); return ret; } return folio_test_clear_dirty(folio); } EXPORT_SYMBOL(folio_clear_dirty_for_io); static void wb_inode_writeback_start(struct bdi_writeback *wb) { atomic_inc(&wb->writeback_inodes); } static void wb_inode_writeback_end(struct bdi_writeback *wb) { unsigned long flags; atomic_dec(&wb->writeback_inodes); /* * Make sure estimate of writeback throughput gets updated after * writeback completed. We delay the update by BANDWIDTH_INTERVAL * (which is the interval other bandwidth updates use for batching) so * that if multiple inodes end writeback at a similar time, they get * batched into one bandwidth update. */ spin_lock_irqsave(&wb->work_lock, flags); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); spin_unlock_irqrestore(&wb->work_lock, flags); } bool __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); bool ret; if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_WRITEBACK); wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, -nr); __wb_writeout_add(wb, nr); if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { wb_inode_writeback_end(wb); if (mapping->host) sb_clear_inode_writeback(mapping->host); } xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); return ret; } void __folio_start_writeback(struct folio *folio, bool keep_write) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); int access_ret; VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; struct bdi_writeback *wb; unsigned long flags; bool on_wblist; xas_lock_irqsave(&xas, flags); xas_load(&xas); folio_test_set_writeback(folio); on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); wb = inode_to_wb(inode); wb_stat_mod(wb, WB_WRITEBACK, nr); if (!on_wblist) { wb_inode_writeback_start(wb); /* * We can come through here when swapping anonymous * folios, so we don't necessarily have an inode to * track for sync. */ if (mapping->host) sb_mark_inode_writeback(mapping->host); } if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { folio_test_set_writeback(folio); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ VM_BUG_ON_FOLIO(access_ret != 0, folio); } EXPORT_SYMBOL(__folio_start_writeback); /** * folio_wait_writeback - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_writeback(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); folio_wait_bit(folio, PG_writeback); } } EXPORT_SYMBOL_GPL(folio_wait_writeback); /** * folio_wait_writeback_killable - Wait for a folio to finish writeback. * @folio: The folio to wait for. * * If the folio is currently being written back to storage, wait for the * I/O to complete or a fatal signal to arrive. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. * Return: 0 on success, -EINTR if we get a fatal signal while waiting. */ int folio_wait_writeback_killable(struct folio *folio) { while (folio_test_writeback(folio)) { trace_folio_wait_writeback(folio, folio_mapping(folio)); if (folio_wait_bit_killable(folio, PG_writeback)) return -EINTR; } return 0; } EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); /** * folio_wait_stable() - wait for writeback to finish, if necessary. * @folio: The folio to wait on. * * This function determines if the given folio is related to a backing * device that requires folio contents to be held stable during writeback. * If so, then it will wait for any pending writeback to complete. * * Context: Sleeps. Must be called in process context and with * no spinlocks held. Caller should hold a reference on the folio. * If the folio is not locked, writeback may start again after writeback * has finished. */ void folio_wait_stable(struct folio *folio) { if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable);
2 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* Protective Load Balancing (PLB) * * PLB was designed to reduce link load imbalance across datacenter * switches. PLB is a host-based optimization; it leverages congestion * signals from the transport layer to randomly change the path of the * connection experiencing sustained congestion. PLB prefers to repath * after idle periods to minimize packet reordering. It repaths by * changing the IPv6 Flow Label on the packets of a connection, which * datacenter switches include as part of ECMP/WCMP hashing. * * PLB is described in detail in: * * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, * David Wetherall,Abdul Kabbani: * "PLB: Congestion Signals are Simple and Effective for * Network Load Balancing" * In ACM SIGCOMM 2022, Amsterdam Netherlands. * */ #include <net/tcp.h> /* Called once per round-trip to update PLB state for a connection. */ void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, const int cong_ratio) { struct net *net = sock_net(sk); if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) return; if (cong_ratio >= 0) { if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) plb->consec_cong_rounds = 0; else if (plb->consec_cong_rounds < READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) plb->consec_cong_rounds++; } } EXPORT_SYMBOL_GPL(tcp_plb_update_state); /* Check whether recent congestion has been persistent enough to warrant * a load balancing decision that switches the connection to another path. */ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) { struct net *net = sock_net(sk); u32 max_suspend; bool forced_rehash = false, idle_rehash = false; if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) return; forced_rehash = plb->consec_cong_rounds >= READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); /* If sender goes idle then we check whether to rehash. */ idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && !tcp_sk(sk)->packets_out && plb->consec_cong_rounds >= READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); if (!forced_rehash && !idle_rehash) return; /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for * cases where the max suspension end is before the actual suspension * end. We clear pause_until to 0 to indicate there is no recent * RTO event that constrains PLB rehashing. */ max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; if (plb->pause_until && (!before(tcp_jiffies32, plb->pause_until) || before(tcp_jiffies32 + max_suspend, plb->pause_until))) plb->pause_until = 0; if (plb->pause_until) return; sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; tcp_sk(sk)->plb_rehash++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); /* Upon RTO, disallow load balancing for a while, to avoid having load * balancing decisions switch traffic to a black-holed path that was * previously avoided with a sk_rethink_txhash() call at RTO time. */ void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) { struct net *net = sock_net(sk); u32 pause; if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) return; pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; pause += get_random_u32_below(pause); plb->pause_until = tcp_jiffies32 + pause; /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call * that may switch this connection to a path with completely different * congestion characteristics. */ plb->consec_cong_rounds = 0; } EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
544 5765 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM exceptions #if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(exceptions, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code), TP_STRUCT__entry( __field( unsigned long, address ) __field( unsigned long, ip ) __field( unsigned long, error_code ) ), TP_fast_assign( __entry->address = address; __entry->ip = instruction_pointer(regs); __entry->error_code = error_code; ), TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); DEFINE_EVENT(exceptions, page_fault_user, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code)); DEFINE_EVENT(exceptions, page_fault_kernel, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code)); #endif /* _TRACE_PAGE_FAULT_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
197 198 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or * 2) we hold the reference to the module. * The latter can be guaranteed by call of try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; struct filename *name; int err, index; name = getname(__name); err = PTR_ERR(name); if (IS_ERR(name)) return err; err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name->name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); putname(name); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res = -EINVAL; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) { if (index == 0) { if (try_module_get(tmp->owner)) res = 0; break; } } read_unlock(&file_systems_lock); if (res) return res; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 // SPDX-License-Identifier: GPL-2.0 /* * hrtimers - High-resolution kernel timers * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes * * Started by: Thomas Gleixner and Ingo Molnar */ #ifndef _LINUX_HRTIMER_H #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> #include <linux/hrtimer_types.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu-defs.h> #include <linux/rbtree.h> #include <linux/timer.h> /* * Mode arguments of xxx_hrtimer functions: * * HRTIMER_MODE_ABS - Time value is absolute * HRTIMER_MODE_REL - Time value is relative to now * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered * when starting the timer) * HRTIMER_MODE_SOFT - Timer callback function will be executed in * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; /* * Values to track state of the timer * * Possible states: * * 0x00 inactive * 0x01 enqueued into rbtree * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free * the timer from the callback function. * * Therefore we track the callback state in: * * timer->base->cpu_base->running == timer * * On SMP it is possible to have a "callback function running and enqueued" * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up * * task is set to NULL, when the timer expires. */ struct hrtimer_sleeper { struct hrtimer timer; struct task_struct *task; }; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; timer->_softexpires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { return timer->node.expires; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { return ktime_to_ns(timer->node.expires); } ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? timer->base->cpu_base->hres_active : 0; } #ifdef CONFIG_HIGH_RES_TIMERS struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC #endif static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { ktime_t rem = ktime_sub(timer->node.expires, now); /* * Adjust relative timers for the extra we added in * hrtimer_start_range_ns() to prevent short timeouts. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) rem -= hrtimer_resolution; return rem; } static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); extern void timerfd_resume(void); #else static inline void timerfd_clock_was_set(void) { } static inline void timerfd_resume(void) { } #endif DECLARE_PER_CPU(struct tick_device, tick_cpu_device); #ifdef CONFIG_PREEMPT_RT void hrtimer_cancel_wait_running(const struct hrtimer *timer); #else static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) { cpu_relax(); } #endif static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) { return HRTIMER_NORESTART; } /* Exported timer functions: */ /* Initialize timers: */ extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); } extern u64 hrtimer_get_next_event(void); extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise * * The function can be used lockless, but it gives only a current snapshot. */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { /* The READ_ONCE pairs with the update functions of timer->state */ return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); } /* * Helper function to check, whether the timer is running the callback * function */ static inline int hrtimer_callback_running(struct hrtimer *timer) { return timer->base->running == timer; } /** * hrtimer_update_function - Update the timer's callback function * @timer: Timer to update * @function: New callback function * * Only safe to call if the timer is not enqueued. Can be called in the callback function if the * timer is not enqueued at the same time (see the comments above HRTIMER_STATE_ENQUEUED). */ static inline void hrtimer_update_function(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *)) { #ifdef CONFIG_PROVE_LOCKING guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); if (WARN_ON_ONCE(hrtimer_is_queued(timer))) return; if (WARN_ON_ONCE(!function)) return; #endif ACCESS_PRIVATE(timer, function) = function; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** * hrtimer_forward_now() - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * * It is a variant of hrtimer_forward(). The timer will expire after the current * time of the hrtimer clock base. See hrtimer_forward() for details. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); int hrtimers_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_cpu_dying(unsigned int cpu); #else #define hrtimers_cpu_dying NULL #endif #endif
5 5 127 127 127 83 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file) { struct linux_binprm *bprm; if (func == BPRM_CHECK) { bprm = container_of(&file, struct linux_binprm, file); return bprm->is_check; } return false; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (is_bprm_creds_for_exec(func, file)) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } /* * ima_reset_appraise_flags - reset ima_iint_cache flags * * @digsig: whether to clear/set IMA_DIGSIG flag, tristate values * 0: clear IMA_DIGSIG * 1: set IMA_DIGSIG * -1: don't change IMA_DIGSIG * */ static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig == 1) set_bit(IMA_DIGSIG, &iint->atomic_flags); else if (digsig == 0) clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } else { digsig = -1; } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), -1); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result, digsig = -1; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { if (!strcmp(xattr_name, XATTR_NAME_IMA)) digsig = 0; ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 /* * Cryptographic API. * * Khazad Algorithm * * The Khazad algorithm was developed by Paulo S. L. M. Barreto and * Vincent Rijmen. It was a finalist in the NESSIE encryption contest. * * The original authors have disclaimed all copyright interest in this * code and thus put it in the public domain. The subsequent authors * have put this under the GNU General Public License. * * By Aaron Grothe ajgrothe@yahoo.com, August 1, 2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * */ #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/unaligned.h> #include <linux/types.h> #define KHAZAD_KEY_SIZE 16 #define KHAZAD_BLOCK_SIZE 8 #define KHAZAD_ROUNDS 8 struct khazad_ctx { u64 E[KHAZAD_ROUNDS + 1]; u64 D[KHAZAD_ROUNDS + 1]; }; static const u64 T0[256] = { 0xbad3d268bbb96a01ULL, 0x54fc4d19e59a66b1ULL, 0x2f71bc93e26514cdULL, 0x749ccdb925871b51ULL, 0x53f55102f7a257a4ULL, 0xd3686bb8d0d6be03ULL, 0xd26b6fbdd6deb504ULL, 0x4dd72964b35285feULL, 0x50f05d0dfdba4aadULL, 0xace98a26cf09e063ULL, 0x8d8a0e83091c9684ULL, 0xbfdcc679a5914d1aULL, 0x7090ddad3da7374dULL, 0x52f65507f1aa5ca3ULL, 0x9ab352c87ba417e1ULL, 0x4cd42d61b55a8ef9ULL, 0xea238f65460320acULL, 0xd56273a6c4e68411ULL, 0x97a466f155cc68c2ULL, 0xd16e63b2dcc6a80dULL, 0x3355ccffaa85d099ULL, 0x51f35908fbb241aaULL, 0x5bed712ac7e20f9cULL, 0xa6f7a204f359ae55ULL, 0xde7f5f81febec120ULL, 0x48d83d75ad7aa2e5ULL, 0xa8e59a32d729cc7fULL, 0x99b65ec771bc0ae8ULL, 0xdb704b90e096e63bULL, 0x3256c8faac8ddb9eULL, 0xb7c4e65195d11522ULL, 0xfc19d72b32b3aaceULL, 0xe338ab48704b7393ULL, 0x9ebf42dc63843bfdULL, 0x91ae7eef41fc52d0ULL, 0x9bb056cd7dac1ce6ULL, 0xe23baf4d76437894ULL, 0xbbd0d66dbdb16106ULL, 0x41c319589b32f1daULL, 0x6eb2a5cb7957e517ULL, 0xa5f2ae0bf941b35cULL, 0xcb400bc08016564bULL, 0x6bbdb1da677fc20cULL, 0x95a26efb59dc7eccULL, 0xa1febe1fe1619f40ULL, 0xf308eb1810cbc3e3ULL, 0xb1cefe4f81e12f30ULL, 0x0206080a0c10160eULL, 0xcc4917db922e675eULL, 0xc45137f3a26e3f66ULL, 0x1d2774694ee8cf53ULL, 0x143c504478a09c6cULL, 0xc3582be8b0560e73ULL, 0x63a591f2573f9a34ULL, 0xda734f95e69eed3cULL, 0x5de76934d3d2358eULL, 0x5fe1613edfc22380ULL, 0xdc79578bf2aed72eULL, 0x7d87e99413cf486eULL, 0xcd4a13de94266c59ULL, 0x7f81e19e1fdf5e60ULL, 0x5aee752fc1ea049bULL, 0x6cb4adc17547f319ULL, 0x5ce46d31d5da3e89ULL, 0xf704fb0c08ebefffULL, 0x266a98bed42d47f2ULL, 0xff1cdb2438abb7c7ULL, 0xed2a937e543b11b9ULL, 0xe825876f4a1336a2ULL, 0x9dba4ed3699c26f4ULL, 0x6fb1a1ce7f5fee10ULL, 0x8e8f028c03048b8dULL, 0x192b647d56c8e34fULL, 0xa0fdba1ae7699447ULL, 0xf00de7171ad3deeaULL, 0x89861e97113cba98ULL, 0x0f113c332278692dULL, 0x07091c1b12383115ULL, 0xafec8629c511fd6aULL, 0xfb10cb30208b9bdbULL, 0x0818202830405838ULL, 0x153f54417ea8976bULL, 0x0d1734392e687f23ULL, 0x040c101418202c1cULL, 0x0103040506080b07ULL, 0x64ac8de94507ab21ULL, 0xdf7c5b84f8b6ca27ULL, 0x769ac5b329970d5fULL, 0x798bf9800bef6472ULL, 0xdd7a538ef4a6dc29ULL, 0x3d47f4c98ef5b2b3ULL, 0x163a584e74b08a62ULL, 0x3f41fcc382e5a4bdULL, 0x3759dcebb2a5fc85ULL, 0x6db7a9c4734ff81eULL, 0x3848e0d890dd95a8ULL, 0xb9d6de67b1a17708ULL, 0x7395d1a237bf2a44ULL, 0xe926836a4c1b3da5ULL, 0x355fd4e1beb5ea8bULL, 0x55ff491ce3926db6ULL, 0x7193d9a83baf3c4aULL, 0x7b8df18a07ff727cULL, 0x8c890a860f149d83ULL, 0x7296d5a731b72143ULL, 0x88851a921734b19fULL, 0xf607ff090ee3e4f8ULL, 0x2a7ea882fc4d33d6ULL, 0x3e42f8c684edafbaULL, 0x5ee2653bd9ca2887ULL, 0x27699cbbd2254cf5ULL, 0x46ca0543890ac0cfULL, 0x0c14303c28607424ULL, 0x65af89ec430fa026ULL, 0x68b8bdd56d67df05ULL, 0x61a399f85b2f8c3aULL, 0x03050c0f0a181d09ULL, 0xc15e23e2bc46187dULL, 0x57f94116ef827bb8ULL, 0xd6677fa9cefe9918ULL, 0xd976439aec86f035ULL, 0x58e87d25cdfa1295ULL, 0xd875479fea8efb32ULL, 0x66aa85e34917bd2fULL, 0xd7647bacc8f6921fULL, 0x3a4ee8d29ccd83a6ULL, 0xc84507cf8a0e4b42ULL, 0x3c44f0cc88fdb9b4ULL, 0xfa13cf35268390dcULL, 0x96a762f453c463c5ULL, 0xa7f4a601f551a552ULL, 0x98b55ac277b401efULL, 0xec29977b52331abeULL, 0xb8d5da62b7a97c0fULL, 0xc7543bfca876226fULL, 0xaeef822cc319f66dULL, 0x69bbb9d06b6fd402ULL, 0x4bdd317aa762bfecULL, 0xabe0963ddd31d176ULL, 0xa9e69e37d121c778ULL, 0x67a981e64f1fb628ULL, 0x0a1e28223c504e36ULL, 0x47c901468f02cbc8ULL, 0xf20bef1d16c3c8e4ULL, 0xb5c2ee5b99c1032cULL, 0x226688aacc0d6beeULL, 0xe532b356647b4981ULL, 0xee2f9f715e230cb0ULL, 0xbedfc27ca399461dULL, 0x2b7dac87fa4538d1ULL, 0x819e3ebf217ce2a0ULL, 0x1236485a6c90a67eULL, 0x839836b52d6cf4aeULL, 0x1b2d6c775ad8f541ULL, 0x0e1238362470622aULL, 0x23658cafca0560e9ULL, 0xf502f30604fbf9f1ULL, 0x45cf094c8312ddc6ULL, 0x216384a5c61576e7ULL, 0xce4f1fd19e3e7150ULL, 0x49db3970ab72a9e2ULL, 0x2c74b09ce87d09c4ULL, 0xf916c33a2c9b8dd5ULL, 0xe637bf596e635488ULL, 0xb6c7e25493d91e25ULL, 0x2878a088f05d25d8ULL, 0x17395c4b72b88165ULL, 0x829b32b02b64ffa9ULL, 0x1a2e68725cd0fe46ULL, 0x8b80169d1d2cac96ULL, 0xfe1fdf213ea3bcc0ULL, 0x8a8312981b24a791ULL, 0x091b242d3648533fULL, 0xc94603ca8c064045ULL, 0x879426a1354cd8b2ULL, 0x4ed2256bb94a98f7ULL, 0xe13ea3427c5b659dULL, 0x2e72b896e46d1fcaULL, 0xe431b75362734286ULL, 0xe03da7477a536e9aULL, 0xeb208b60400b2babULL, 0x90ad7aea47f459d7ULL, 0xa4f1aa0eff49b85bULL, 0x1e22786644f0d25aULL, 0x85922eab395ccebcULL, 0x60a09dfd5d27873dULL, 0x0000000000000000ULL, 0x256f94b1de355afbULL, 0xf401f70302f3f2f6ULL, 0xf10ee3121cdbd5edULL, 0x94a16afe5fd475cbULL, 0x0b1d2c273a584531ULL, 0xe734bb5c686b5f8fULL, 0x759fc9bc238f1056ULL, 0xef2c9b74582b07b7ULL, 0x345cd0e4b8bde18cULL, 0x3153c4f5a695c697ULL, 0xd46177a3c2ee8f16ULL, 0xd06d67b7dacea30aULL, 0x869722a43344d3b5ULL, 0x7e82e59b19d75567ULL, 0xadea8e23c901eb64ULL, 0xfd1ad32e34bba1c9ULL, 0x297ba48df6552edfULL, 0x3050c0f0a09dcd90ULL, 0x3b4decd79ac588a1ULL, 0x9fbc46d9658c30faULL, 0xf815c73f2a9386d2ULL, 0xc6573ff9ae7e2968ULL, 0x13354c5f6a98ad79ULL, 0x060a181e14303a12ULL, 0x050f14111e28271bULL, 0xc55233f6a4663461ULL, 0x113344556688bb77ULL, 0x7799c1b62f9f0658ULL, 0x7c84ed9115c74369ULL, 0x7a8ef58f01f7797bULL, 0x7888fd850de76f75ULL, 0x365ad8eeb4adf782ULL, 0x1c24706c48e0c454ULL, 0x394be4dd96d59eafULL, 0x59eb7920cbf21992ULL, 0x1828607850c0e848ULL, 0x56fa4513e98a70bfULL, 0xb3c8f6458df1393eULL, 0xb0cdfa4a87e92437ULL, 0x246c90b4d83d51fcULL, 0x206080a0c01d7de0ULL, 0xb2cbf2408bf93239ULL, 0x92ab72e04be44fd9ULL, 0xa3f8b615ed71894eULL, 0xc05d27e7ba4e137aULL, 0x44cc0d49851ad6c1ULL, 0x62a695f751379133ULL, 0x103040506080b070ULL, 0xb4c1ea5e9fc9082bULL, 0x84912aae3f54c5bbULL, 0x43c511529722e7d4ULL, 0x93a876e54dec44deULL, 0xc25b2fedb65e0574ULL, 0x4ade357fa16ab4ebULL, 0xbddace73a9815b14ULL, 0x8f8c0689050c808aULL, 0x2d77b499ee7502c3ULL, 0xbcd9ca76af895013ULL, 0x9cb94ad66f942df3ULL, 0x6abeb5df6177c90bULL, 0x40c01d5d9d3afaddULL, 0xcf4c1bd498367a57ULL, 0xa2fbb210eb798249ULL, 0x809d3aba2774e9a7ULL, 0x4fd1216ebf4293f0ULL, 0x1f217c6342f8d95dULL, 0xca430fc5861e5d4cULL, 0xaae39238db39da71ULL, 0x42c61557912aecd3ULL }; static const u64 T1[256] = { 0xd3ba68d2b9bb016aULL, 0xfc54194d9ae5b166ULL, 0x712f93bc65e2cd14ULL, 0x9c74b9cd8725511bULL, 0xf5530251a2f7a457ULL, 0x68d3b86bd6d003beULL, 0x6bd2bd6fded604b5ULL, 0xd74d642952b3fe85ULL, 0xf0500d5dbafdad4aULL, 0xe9ac268a09cf63e0ULL, 0x8a8d830e1c098496ULL, 0xdcbf79c691a51a4dULL, 0x9070addda73d4d37ULL, 0xf6520755aaf1a35cULL, 0xb39ac852a47be117ULL, 0xd44c612d5ab5f98eULL, 0x23ea658f0346ac20ULL, 0x62d5a673e6c41184ULL, 0xa497f166cc55c268ULL, 0x6ed1b263c6dc0da8ULL, 0x5533ffcc85aa99d0ULL, 0xf3510859b2fbaa41ULL, 0xed5b2a71e2c79c0fULL, 0xf7a604a259f355aeULL, 0x7fde815fbefe20c1ULL, 0xd848753d7aade5a2ULL, 0xe5a8329a29d77fccULL, 0xb699c75ebc71e80aULL, 0x70db904b96e03be6ULL, 0x5632fac88dac9edbULL, 0xc4b751e6d1952215ULL, 0x19fc2bd7b332ceaaULL, 0x38e348ab4b709373ULL, 0xbf9edc428463fd3bULL, 0xae91ef7efc41d052ULL, 0xb09bcd56ac7de61cULL, 0x3be24daf43769478ULL, 0xd0bb6dd6b1bd0661ULL, 0xc3415819329bdaf1ULL, 0xb26ecba5577917e5ULL, 0xf2a50bae41f95cb3ULL, 0x40cbc00b16804b56ULL, 0xbd6bdab17f670cc2ULL, 0xa295fb6edc59cc7eULL, 0xfea11fbe61e1409fULL, 0x08f318ebcb10e3c3ULL, 0xceb14ffee181302fULL, 0x06020a08100c0e16ULL, 0x49ccdb172e925e67ULL, 0x51c4f3376ea2663fULL, 0x271d6974e84e53cfULL, 0x3c144450a0786c9cULL, 0x58c3e82b56b0730eULL, 0xa563f2913f57349aULL, 0x73da954f9ee63cedULL, 0xe75d3469d2d38e35ULL, 0xe15f3e61c2df8023ULL, 0x79dc8b57aef22ed7ULL, 0x877d94e9cf136e48ULL, 0x4acdde132694596cULL, 0x817f9ee1df1f605eULL, 0xee5a2f75eac19b04ULL, 0xb46cc1ad477519f3ULL, 0xe45c316ddad5893eULL, 0x04f70cfbeb08ffefULL, 0x6a26be982dd4f247ULL, 0x1cff24dbab38c7b7ULL, 0x2aed7e933b54b911ULL, 0x25e86f87134aa236ULL, 0xba9dd34e9c69f426ULL, 0xb16fcea15f7f10eeULL, 0x8f8e8c0204038d8bULL, 0x2b197d64c8564fe3ULL, 0xfda01aba69e74794ULL, 0x0df017e7d31aeadeULL, 0x8689971e3c1198baULL, 0x110f333c78222d69ULL, 0x09071b1c38121531ULL, 0xecaf298611c56afdULL, 0x10fb30cb8b20db9bULL, 0x1808282040303858ULL, 0x3f154154a87e6b97ULL, 0x170d3934682e237fULL, 0x0c04141020181c2cULL, 0x030105040806070bULL, 0xac64e98d074521abULL, 0x7cdf845bb6f827caULL, 0x9a76b3c597295f0dULL, 0x8b7980f9ef0b7264ULL, 0x7add8e53a6f429dcULL, 0x473dc9f4f58eb3b2ULL, 0x3a164e58b074628aULL, 0x413fc3fce582bda4ULL, 0x5937ebdca5b285fcULL, 0xb76dc4a94f731ef8ULL, 0x4838d8e0dd90a895ULL, 0xd6b967dea1b10877ULL, 0x9573a2d1bf37442aULL, 0x26e96a831b4ca53dULL, 0x5f35e1d4b5be8beaULL, 0xff551c4992e3b66dULL, 0x9371a8d9af3b4a3cULL, 0x8d7b8af1ff077c72ULL, 0x898c860a140f839dULL, 0x9672a7d5b7314321ULL, 0x8588921a34179fb1ULL, 0x07f609ffe30ef8e4ULL, 0x7e2a82a84dfcd633ULL, 0x423ec6f8ed84baafULL, 0xe25e3b65cad98728ULL, 0x6927bb9c25d2f54cULL, 0xca4643050a89cfc0ULL, 0x140c3c3060282474ULL, 0xaf65ec890f4326a0ULL, 0xb868d5bd676d05dfULL, 0xa361f8992f5b3a8cULL, 0x05030f0c180a091dULL, 0x5ec1e22346bc7d18ULL, 0xf957164182efb87bULL, 0x67d6a97ffece1899ULL, 0x76d99a4386ec35f0ULL, 0xe858257dfacd9512ULL, 0x75d89f478eea32fbULL, 0xaa66e38517492fbdULL, 0x64d7ac7bf6c81f92ULL, 0x4e3ad2e8cd9ca683ULL, 0x45c8cf070e8a424bULL, 0x443cccf0fd88b4b9ULL, 0x13fa35cf8326dc90ULL, 0xa796f462c453c563ULL, 0xf4a701a651f552a5ULL, 0xb598c25ab477ef01ULL, 0x29ec7b973352be1aULL, 0xd5b862daa9b70f7cULL, 0x54c7fc3b76a86f22ULL, 0xefae2c8219c36df6ULL, 0xbb69d0b96f6b02d4ULL, 0xdd4b7a3162a7ecbfULL, 0xe0ab3d9631dd76d1ULL, 0xe6a9379e21d178c7ULL, 0xa967e6811f4f28b6ULL, 0x1e0a2228503c364eULL, 0xc9474601028fc8cbULL, 0x0bf21defc316e4c8ULL, 0xc2b55beec1992c03ULL, 0x6622aa880dccee6bULL, 0x32e556b37b648149ULL, 0x2fee719f235eb00cULL, 0xdfbe7cc299a31d46ULL, 0x7d2b87ac45fad138ULL, 0x9e81bf3e7c21a0e2ULL, 0x36125a48906c7ea6ULL, 0x9883b5366c2daef4ULL, 0x2d1b776cd85a41f5ULL, 0x120e363870242a62ULL, 0x6523af8c05cae960ULL, 0x02f506f3fb04f1f9ULL, 0xcf454c091283c6ddULL, 0x6321a58415c6e776ULL, 0x4fced11f3e9e5071ULL, 0xdb49703972abe2a9ULL, 0x742c9cb07de8c409ULL, 0x16f93ac39b2cd58dULL, 0x37e659bf636e8854ULL, 0xc7b654e2d993251eULL, 0x782888a05df0d825ULL, 0x39174b5cb8726581ULL, 0x9b82b032642ba9ffULL, 0x2e1a7268d05c46feULL, 0x808b9d162c1d96acULL, 0x1ffe21dfa33ec0bcULL, 0x838a9812241b91a7ULL, 0x1b092d2448363f53ULL, 0x46c9ca03068c4540ULL, 0x9487a1264c35b2d8ULL, 0xd24e6b254ab9f798ULL, 0x3ee142a35b7c9d65ULL, 0x722e96b86de4ca1fULL, 0x31e453b773628642ULL, 0x3de047a7537a9a6eULL, 0x20eb608b0b40ab2bULL, 0xad90ea7af447d759ULL, 0xf1a40eaa49ff5bb8ULL, 0x221e6678f0445ad2ULL, 0x9285ab2e5c39bcceULL, 0xa060fd9d275d3d87ULL, 0x0000000000000000ULL, 0x6f25b19435defb5aULL, 0x01f403f7f302f6f2ULL, 0x0ef112e3db1cedd5ULL, 0xa194fe6ad45fcb75ULL, 0x1d0b272c583a3145ULL, 0x34e75cbb6b688f5fULL, 0x9f75bcc98f235610ULL, 0x2cef749b2b58b707ULL, 0x5c34e4d0bdb88ce1ULL, 0x5331f5c495a697c6ULL, 0x61d4a377eec2168fULL, 0x6dd0b767ceda0aa3ULL, 0x9786a4224433b5d3ULL, 0x827e9be5d7196755ULL, 0xeaad238e01c964ebULL, 0x1afd2ed3bb34c9a1ULL, 0x7b298da455f6df2eULL, 0x5030f0c09da090cdULL, 0x4d3bd7ecc59aa188ULL, 0xbc9fd9468c65fa30ULL, 0x15f83fc7932ad286ULL, 0x57c6f93f7eae6829ULL, 0x35135f4c986a79adULL, 0x0a061e183014123aULL, 0x0f051114281e1b27ULL, 0x52c5f63366a46134ULL, 0x33115544886677bbULL, 0x9977b6c19f2f5806ULL, 0x847c91edc7156943ULL, 0x8e7a8ff5f7017b79ULL, 0x887885fde70d756fULL, 0x5a36eed8adb482f7ULL, 0x241c6c70e04854c4ULL, 0x4b39dde4d596af9eULL, 0xeb592079f2cb9219ULL, 0x28187860c05048e8ULL, 0xfa5613458ae9bf70ULL, 0xc8b345f6f18d3e39ULL, 0xcdb04afae9873724ULL, 0x6c24b4903dd8fc51ULL, 0x6020a0801dc0e07dULL, 0xcbb240f2f98b3932ULL, 0xab92e072e44bd94fULL, 0xf8a315b671ed4e89ULL, 0x5dc0e7274eba7a13ULL, 0xcc44490d1a85c1d6ULL, 0xa662f79537513391ULL, 0x30105040806070b0ULL, 0xc1b45eeac99f2b08ULL, 0x9184ae2a543fbbc5ULL, 0xc54352112297d4e7ULL, 0xa893e576ec4dde44ULL, 0x5bc2ed2f5eb67405ULL, 0xde4a7f356aa1ebb4ULL, 0xdabd73ce81a9145bULL, 0x8c8f89060c058a80ULL, 0x772d99b475eec302ULL, 0xd9bc76ca89af1350ULL, 0xb99cd64a946ff32dULL, 0xbe6adfb577610bc9ULL, 0xc0405d1d3a9dddfaULL, 0x4ccfd41b3698577aULL, 0xfba210b279eb4982ULL, 0x9d80ba3a7427a7e9ULL, 0xd14f6e2142bff093ULL, 0x211f637cf8425dd9ULL, 0x43cac50f1e864c5dULL, 0xe3aa389239db71daULL, 0xc64257152a91d3ecULL }; static const u64 T2[256] = { 0xd268bad36a01bbb9ULL, 0x4d1954fc66b1e59aULL, 0xbc932f7114cde265ULL, 0xcdb9749c1b512587ULL, 0x510253f557a4f7a2ULL, 0x6bb8d368be03d0d6ULL, 0x6fbdd26bb504d6deULL, 0x29644dd785feb352ULL, 0x5d0d50f04aadfdbaULL, 0x8a26ace9e063cf09ULL, 0x0e838d8a9684091cULL, 0xc679bfdc4d1aa591ULL, 0xddad7090374d3da7ULL, 0x550752f65ca3f1aaULL, 0x52c89ab317e17ba4ULL, 0x2d614cd48ef9b55aULL, 0x8f65ea2320ac4603ULL, 0x73a6d5628411c4e6ULL, 0x66f197a468c255ccULL, 0x63b2d16ea80ddcc6ULL, 0xccff3355d099aa85ULL, 0x590851f341aafbb2ULL, 0x712a5bed0f9cc7e2ULL, 0xa204a6f7ae55f359ULL, 0x5f81de7fc120febeULL, 0x3d7548d8a2e5ad7aULL, 0x9a32a8e5cc7fd729ULL, 0x5ec799b60ae871bcULL, 0x4b90db70e63be096ULL, 0xc8fa3256db9eac8dULL, 0xe651b7c4152295d1ULL, 0xd72bfc19aace32b3ULL, 0xab48e3387393704bULL, 0x42dc9ebf3bfd6384ULL, 0x7eef91ae52d041fcULL, 0x56cd9bb01ce67dacULL, 0xaf4de23b78947643ULL, 0xd66dbbd06106bdb1ULL, 0x195841c3f1da9b32ULL, 0xa5cb6eb2e5177957ULL, 0xae0ba5f2b35cf941ULL, 0x0bc0cb40564b8016ULL, 0xb1da6bbdc20c677fULL, 0x6efb95a27ecc59dcULL, 0xbe1fa1fe9f40e161ULL, 0xeb18f308c3e310cbULL, 0xfe4fb1ce2f3081e1ULL, 0x080a0206160e0c10ULL, 0x17dbcc49675e922eULL, 0x37f3c4513f66a26eULL, 0x74691d27cf534ee8ULL, 0x5044143c9c6c78a0ULL, 0x2be8c3580e73b056ULL, 0x91f263a59a34573fULL, 0x4f95da73ed3ce69eULL, 0x69345de7358ed3d2ULL, 0x613e5fe12380dfc2ULL, 0x578bdc79d72ef2aeULL, 0xe9947d87486e13cfULL, 0x13decd4a6c599426ULL, 0xe19e7f815e601fdfULL, 0x752f5aee049bc1eaULL, 0xadc16cb4f3197547ULL, 0x6d315ce43e89d5daULL, 0xfb0cf704efff08ebULL, 0x98be266a47f2d42dULL, 0xdb24ff1cb7c738abULL, 0x937eed2a11b9543bULL, 0x876fe82536a24a13ULL, 0x4ed39dba26f4699cULL, 0xa1ce6fb1ee107f5fULL, 0x028c8e8f8b8d0304ULL, 0x647d192be34f56c8ULL, 0xba1aa0fd9447e769ULL, 0xe717f00ddeea1ad3ULL, 0x1e978986ba98113cULL, 0x3c330f11692d2278ULL, 0x1c1b070931151238ULL, 0x8629afecfd6ac511ULL, 0xcb30fb109bdb208bULL, 0x2028081858383040ULL, 0x5441153f976b7ea8ULL, 0x34390d177f232e68ULL, 0x1014040c2c1c1820ULL, 0x040501030b070608ULL, 0x8de964acab214507ULL, 0x5b84df7cca27f8b6ULL, 0xc5b3769a0d5f2997ULL, 0xf980798b64720befULL, 0x538edd7adc29f4a6ULL, 0xf4c93d47b2b38ef5ULL, 0x584e163a8a6274b0ULL, 0xfcc33f41a4bd82e5ULL, 0xdceb3759fc85b2a5ULL, 0xa9c46db7f81e734fULL, 0xe0d8384895a890ddULL, 0xde67b9d67708b1a1ULL, 0xd1a273952a4437bfULL, 0x836ae9263da54c1bULL, 0xd4e1355fea8bbeb5ULL, 0x491c55ff6db6e392ULL, 0xd9a871933c4a3bafULL, 0xf18a7b8d727c07ffULL, 0x0a868c899d830f14ULL, 0xd5a77296214331b7ULL, 0x1a928885b19f1734ULL, 0xff09f607e4f80ee3ULL, 0xa8822a7e33d6fc4dULL, 0xf8c63e42afba84edULL, 0x653b5ee22887d9caULL, 0x9cbb27694cf5d225ULL, 0x054346cac0cf890aULL, 0x303c0c1474242860ULL, 0x89ec65afa026430fULL, 0xbdd568b8df056d67ULL, 0x99f861a38c3a5b2fULL, 0x0c0f03051d090a18ULL, 0x23e2c15e187dbc46ULL, 0x411657f97bb8ef82ULL, 0x7fa9d6679918cefeULL, 0x439ad976f035ec86ULL, 0x7d2558e81295cdfaULL, 0x479fd875fb32ea8eULL, 0x85e366aabd2f4917ULL, 0x7bacd764921fc8f6ULL, 0xe8d23a4e83a69ccdULL, 0x07cfc8454b428a0eULL, 0xf0cc3c44b9b488fdULL, 0xcf35fa1390dc2683ULL, 0x62f496a763c553c4ULL, 0xa601a7f4a552f551ULL, 0x5ac298b501ef77b4ULL, 0x977bec291abe5233ULL, 0xda62b8d57c0fb7a9ULL, 0x3bfcc754226fa876ULL, 0x822caeeff66dc319ULL, 0xb9d069bbd4026b6fULL, 0x317a4bddbfeca762ULL, 0x963dabe0d176dd31ULL, 0x9e37a9e6c778d121ULL, 0x81e667a9b6284f1fULL, 0x28220a1e4e363c50ULL, 0x014647c9cbc88f02ULL, 0xef1df20bc8e416c3ULL, 0xee5bb5c2032c99c1ULL, 0x88aa22666beecc0dULL, 0xb356e5324981647bULL, 0x9f71ee2f0cb05e23ULL, 0xc27cbedf461da399ULL, 0xac872b7d38d1fa45ULL, 0x3ebf819ee2a0217cULL, 0x485a1236a67e6c90ULL, 0x36b58398f4ae2d6cULL, 0x6c771b2df5415ad8ULL, 0x38360e12622a2470ULL, 0x8caf236560e9ca05ULL, 0xf306f502f9f104fbULL, 0x094c45cfddc68312ULL, 0x84a5216376e7c615ULL, 0x1fd1ce4f71509e3eULL, 0x397049dba9e2ab72ULL, 0xb09c2c7409c4e87dULL, 0xc33af9168dd52c9bULL, 0xbf59e63754886e63ULL, 0xe254b6c71e2593d9ULL, 0xa088287825d8f05dULL, 0x5c4b1739816572b8ULL, 0x32b0829bffa92b64ULL, 0x68721a2efe465cd0ULL, 0x169d8b80ac961d2cULL, 0xdf21fe1fbcc03ea3ULL, 0x12988a83a7911b24ULL, 0x242d091b533f3648ULL, 0x03cac94640458c06ULL, 0x26a18794d8b2354cULL, 0x256b4ed298f7b94aULL, 0xa342e13e659d7c5bULL, 0xb8962e721fcae46dULL, 0xb753e43142866273ULL, 0xa747e03d6e9a7a53ULL, 0x8b60eb202bab400bULL, 0x7aea90ad59d747f4ULL, 0xaa0ea4f1b85bff49ULL, 0x78661e22d25a44f0ULL, 0x2eab8592cebc395cULL, 0x9dfd60a0873d5d27ULL, 0x0000000000000000ULL, 0x94b1256f5afbde35ULL, 0xf703f401f2f602f3ULL, 0xe312f10ed5ed1cdbULL, 0x6afe94a175cb5fd4ULL, 0x2c270b1d45313a58ULL, 0xbb5ce7345f8f686bULL, 0xc9bc759f1056238fULL, 0x9b74ef2c07b7582bULL, 0xd0e4345ce18cb8bdULL, 0xc4f53153c697a695ULL, 0x77a3d4618f16c2eeULL, 0x67b7d06da30adaceULL, 0x22a48697d3b53344ULL, 0xe59b7e82556719d7ULL, 0x8e23adeaeb64c901ULL, 0xd32efd1aa1c934bbULL, 0xa48d297b2edff655ULL, 0xc0f03050cd90a09dULL, 0xecd73b4d88a19ac5ULL, 0x46d99fbc30fa658cULL, 0xc73ff81586d22a93ULL, 0x3ff9c6572968ae7eULL, 0x4c5f1335ad796a98ULL, 0x181e060a3a121430ULL, 0x1411050f271b1e28ULL, 0x33f6c5523461a466ULL, 0x44551133bb776688ULL, 0xc1b6779906582f9fULL, 0xed917c84436915c7ULL, 0xf58f7a8e797b01f7ULL, 0xfd8578886f750de7ULL, 0xd8ee365af782b4adULL, 0x706c1c24c45448e0ULL, 0xe4dd394b9eaf96d5ULL, 0x792059eb1992cbf2ULL, 0x60781828e84850c0ULL, 0x451356fa70bfe98aULL, 0xf645b3c8393e8df1ULL, 0xfa4ab0cd243787e9ULL, 0x90b4246c51fcd83dULL, 0x80a020607de0c01dULL, 0xf240b2cb32398bf9ULL, 0x72e092ab4fd94be4ULL, 0xb615a3f8894eed71ULL, 0x27e7c05d137aba4eULL, 0x0d4944ccd6c1851aULL, 0x95f762a691335137ULL, 0x40501030b0706080ULL, 0xea5eb4c1082b9fc9ULL, 0x2aae8491c5bb3f54ULL, 0x115243c5e7d49722ULL, 0x76e593a844de4decULL, 0x2fedc25b0574b65eULL, 0x357f4adeb4eba16aULL, 0xce73bdda5b14a981ULL, 0x06898f8c808a050cULL, 0xb4992d7702c3ee75ULL, 0xca76bcd95013af89ULL, 0x4ad69cb92df36f94ULL, 0xb5df6abec90b6177ULL, 0x1d5d40c0fadd9d3aULL, 0x1bd4cf4c7a579836ULL, 0xb210a2fb8249eb79ULL, 0x3aba809de9a72774ULL, 0x216e4fd193f0bf42ULL, 0x7c631f21d95d42f8ULL, 0x0fc5ca435d4c861eULL, 0x9238aae3da71db39ULL, 0x155742c6ecd3912aULL }; static const u64 T3[256] = { 0x68d2d3ba016ab9bbULL, 0x194dfc54b1669ae5ULL, 0x93bc712fcd1465e2ULL, 0xb9cd9c74511b8725ULL, 0x0251f553a457a2f7ULL, 0xb86b68d303bed6d0ULL, 0xbd6f6bd204b5ded6ULL, 0x6429d74dfe8552b3ULL, 0x0d5df050ad4abafdULL, 0x268ae9ac63e009cfULL, 0x830e8a8d84961c09ULL, 0x79c6dcbf1a4d91a5ULL, 0xaddd90704d37a73dULL, 0x0755f652a35caaf1ULL, 0xc852b39ae117a47bULL, 0x612dd44cf98e5ab5ULL, 0x658f23eaac200346ULL, 0xa67362d51184e6c4ULL, 0xf166a497c268cc55ULL, 0xb2636ed10da8c6dcULL, 0xffcc553399d085aaULL, 0x0859f351aa41b2fbULL, 0x2a71ed5b9c0fe2c7ULL, 0x04a2f7a655ae59f3ULL, 0x815f7fde20c1befeULL, 0x753dd848e5a27aadULL, 0x329ae5a87fcc29d7ULL, 0xc75eb699e80abc71ULL, 0x904b70db3be696e0ULL, 0xfac856329edb8dacULL, 0x51e6c4b72215d195ULL, 0x2bd719fcceaab332ULL, 0x48ab38e393734b70ULL, 0xdc42bf9efd3b8463ULL, 0xef7eae91d052fc41ULL, 0xcd56b09be61cac7dULL, 0x4daf3be294784376ULL, 0x6dd6d0bb0661b1bdULL, 0x5819c341daf1329bULL, 0xcba5b26e17e55779ULL, 0x0baef2a55cb341f9ULL, 0xc00b40cb4b561680ULL, 0xdab1bd6b0cc27f67ULL, 0xfb6ea295cc7edc59ULL, 0x1fbefea1409f61e1ULL, 0x18eb08f3e3c3cb10ULL, 0x4ffeceb1302fe181ULL, 0x0a0806020e16100cULL, 0xdb1749cc5e672e92ULL, 0xf33751c4663f6ea2ULL, 0x6974271d53cfe84eULL, 0x44503c146c9ca078ULL, 0xe82b58c3730e56b0ULL, 0xf291a563349a3f57ULL, 0x954f73da3ced9ee6ULL, 0x3469e75d8e35d2d3ULL, 0x3e61e15f8023c2dfULL, 0x8b5779dc2ed7aef2ULL, 0x94e9877d6e48cf13ULL, 0xde134acd596c2694ULL, 0x9ee1817f605edf1fULL, 0x2f75ee5a9b04eac1ULL, 0xc1adb46c19f34775ULL, 0x316de45c893edad5ULL, 0x0cfb04f7ffefeb08ULL, 0xbe986a26f2472dd4ULL, 0x24db1cffc7b7ab38ULL, 0x7e932aedb9113b54ULL, 0x6f8725e8a236134aULL, 0xd34eba9df4269c69ULL, 0xcea1b16f10ee5f7fULL, 0x8c028f8e8d8b0403ULL, 0x7d642b194fe3c856ULL, 0x1abafda0479469e7ULL, 0x17e70df0eaded31aULL, 0x971e868998ba3c11ULL, 0x333c110f2d697822ULL, 0x1b1c090715313812ULL, 0x2986ecaf6afd11c5ULL, 0x30cb10fbdb9b8b20ULL, 0x2820180838584030ULL, 0x41543f156b97a87eULL, 0x3934170d237f682eULL, 0x14100c041c2c2018ULL, 0x05040301070b0806ULL, 0xe98dac6421ab0745ULL, 0x845b7cdf27cab6f8ULL, 0xb3c59a765f0d9729ULL, 0x80f98b797264ef0bULL, 0x8e537add29dca6f4ULL, 0xc9f4473db3b2f58eULL, 0x4e583a16628ab074ULL, 0xc3fc413fbda4e582ULL, 0xebdc593785fca5b2ULL, 0xc4a9b76d1ef84f73ULL, 0xd8e04838a895dd90ULL, 0x67ded6b90877a1b1ULL, 0xa2d19573442abf37ULL, 0x6a8326e9a53d1b4cULL, 0xe1d45f358beab5beULL, 0x1c49ff55b66d92e3ULL, 0xa8d993714a3caf3bULL, 0x8af18d7b7c72ff07ULL, 0x860a898c839d140fULL, 0xa7d596724321b731ULL, 0x921a85889fb13417ULL, 0x09ff07f6f8e4e30eULL, 0x82a87e2ad6334dfcULL, 0xc6f8423ebaafed84ULL, 0x3b65e25e8728cad9ULL, 0xbb9c6927f54c25d2ULL, 0x4305ca46cfc00a89ULL, 0x3c30140c24746028ULL, 0xec89af6526a00f43ULL, 0xd5bdb86805df676dULL, 0xf899a3613a8c2f5bULL, 0x0f0c0503091d180aULL, 0xe2235ec17d1846bcULL, 0x1641f957b87b82efULL, 0xa97f67d61899feceULL, 0x9a4376d935f086ecULL, 0x257de8589512facdULL, 0x9f4775d832fb8eeaULL, 0xe385aa662fbd1749ULL, 0xac7b64d71f92f6c8ULL, 0xd2e84e3aa683cd9cULL, 0xcf0745c8424b0e8aULL, 0xccf0443cb4b9fd88ULL, 0x35cf13fadc908326ULL, 0xf462a796c563c453ULL, 0x01a6f4a752a551f5ULL, 0xc25ab598ef01b477ULL, 0x7b9729ecbe1a3352ULL, 0x62dad5b80f7ca9b7ULL, 0xfc3b54c76f2276a8ULL, 0x2c82efae6df619c3ULL, 0xd0b9bb6902d46f6bULL, 0x7a31dd4becbf62a7ULL, 0x3d96e0ab76d131ddULL, 0x379ee6a978c721d1ULL, 0xe681a96728b61f4fULL, 0x22281e0a364e503cULL, 0x4601c947c8cb028fULL, 0x1def0bf2e4c8c316ULL, 0x5beec2b52c03c199ULL, 0xaa886622ee6b0dccULL, 0x56b332e581497b64ULL, 0x719f2feeb00c235eULL, 0x7cc2dfbe1d4699a3ULL, 0x87ac7d2bd13845faULL, 0xbf3e9e81a0e27c21ULL, 0x5a4836127ea6906cULL, 0xb5369883aef46c2dULL, 0x776c2d1b41f5d85aULL, 0x3638120e2a627024ULL, 0xaf8c6523e96005caULL, 0x06f302f5f1f9fb04ULL, 0x4c09cf45c6dd1283ULL, 0xa5846321e77615c6ULL, 0xd11f4fce50713e9eULL, 0x7039db49e2a972abULL, 0x9cb0742cc4097de8ULL, 0x3ac316f9d58d9b2cULL, 0x59bf37e68854636eULL, 0x54e2c7b6251ed993ULL, 0x88a07828d8255df0ULL, 0x4b5c39176581b872ULL, 0xb0329b82a9ff642bULL, 0x72682e1a46fed05cULL, 0x9d16808b96ac2c1dULL, 0x21df1ffec0bca33eULL, 0x9812838a91a7241bULL, 0x2d241b093f534836ULL, 0xca0346c94540068cULL, 0xa1269487b2d84c35ULL, 0x6b25d24ef7984ab9ULL, 0x42a33ee19d655b7cULL, 0x96b8722eca1f6de4ULL, 0x53b731e486427362ULL, 0x47a73de09a6e537aULL, 0x608b20ebab2b0b40ULL, 0xea7aad90d759f447ULL, 0x0eaaf1a45bb849ffULL, 0x6678221e5ad2f044ULL, 0xab2e9285bcce5c39ULL, 0xfd9da0603d87275dULL, 0x0000000000000000ULL, 0xb1946f25fb5a35deULL, 0x03f701f4f6f2f302ULL, 0x12e30ef1edd5db1cULL, 0xfe6aa194cb75d45fULL, 0x272c1d0b3145583aULL, 0x5cbb34e78f5f6b68ULL, 0xbcc99f7556108f23ULL, 0x749b2cefb7072b58ULL, 0xe4d05c348ce1bdb8ULL, 0xf5c4533197c695a6ULL, 0xa37761d4168feec2ULL, 0xb7676dd00aa3cedaULL, 0xa4229786b5d34433ULL, 0x9be5827e6755d719ULL, 0x238eeaad64eb01c9ULL, 0x2ed31afdc9a1bb34ULL, 0x8da47b29df2e55f6ULL, 0xf0c0503090cd9da0ULL, 0xd7ec4d3ba188c59aULL, 0xd946bc9ffa308c65ULL, 0x3fc715f8d286932aULL, 0xf93f57c668297eaeULL, 0x5f4c351379ad986aULL, 0x1e180a06123a3014ULL, 0x11140f051b27281eULL, 0xf63352c5613466a4ULL, 0x5544331177bb8866ULL, 0xb6c1997758069f2fULL, 0x91ed847c6943c715ULL, 0x8ff58e7a7b79f701ULL, 0x85fd8878756fe70dULL, 0xeed85a3682f7adb4ULL, 0x6c70241c54c4e048ULL, 0xdde44b39af9ed596ULL, 0x2079eb599219f2cbULL, 0x7860281848e8c050ULL, 0x1345fa56bf708ae9ULL, 0x45f6c8b33e39f18dULL, 0x4afacdb03724e987ULL, 0xb4906c24fc513dd8ULL, 0xa0806020e07d1dc0ULL, 0x40f2cbb23932f98bULL, 0xe072ab92d94fe44bULL, 0x15b6f8a34e8971edULL, 0xe7275dc07a134ebaULL, 0x490dcc44c1d61a85ULL, 0xf795a66233913751ULL, 0x5040301070b08060ULL, 0x5eeac1b42b08c99fULL, 0xae2a9184bbc5543fULL, 0x5211c543d4e72297ULL, 0xe576a893de44ec4dULL, 0xed2f5bc274055eb6ULL, 0x7f35de4aebb46aa1ULL, 0x73cedabd145b81a9ULL, 0x89068c8f8a800c05ULL, 0x99b4772dc30275eeULL, 0x76cad9bc135089afULL, 0xd64ab99cf32d946fULL, 0xdfb5be6a0bc97761ULL, 0x5d1dc040ddfa3a9dULL, 0xd41b4ccf577a3698ULL, 0x10b2fba2498279ebULL, 0xba3a9d80a7e97427ULL, 0x6e21d14ff09342bfULL, 0x637c211f5dd9f842ULL, 0xc50f43ca4c5d1e86ULL, 0x3892e3aa71da39dbULL, 0x5715c642d3ec2a91ULL }; static const u64 T4[256] = { 0xbbb96a01bad3d268ULL, 0xe59a66b154fc4d19ULL, 0xe26514cd2f71bc93ULL, 0x25871b51749ccdb9ULL, 0xf7a257a453f55102ULL, 0xd0d6be03d3686bb8ULL, 0xd6deb504d26b6fbdULL, 0xb35285fe4dd72964ULL, 0xfdba4aad50f05d0dULL, 0xcf09e063ace98a26ULL, 0x091c96848d8a0e83ULL, 0xa5914d1abfdcc679ULL, 0x3da7374d7090ddadULL, 0xf1aa5ca352f65507ULL, 0x7ba417e19ab352c8ULL, 0xb55a8ef94cd42d61ULL, 0x460320acea238f65ULL, 0xc4e68411d56273a6ULL, 0x55cc68c297a466f1ULL, 0xdcc6a80dd16e63b2ULL, 0xaa85d0993355ccffULL, 0xfbb241aa51f35908ULL, 0xc7e20f9c5bed712aULL, 0xf359ae55a6f7a204ULL, 0xfebec120de7f5f81ULL, 0xad7aa2e548d83d75ULL, 0xd729cc7fa8e59a32ULL, 0x71bc0ae899b65ec7ULL, 0xe096e63bdb704b90ULL, 0xac8ddb9e3256c8faULL, 0x95d11522b7c4e651ULL, 0x32b3aacefc19d72bULL, 0x704b7393e338ab48ULL, 0x63843bfd9ebf42dcULL, 0x41fc52d091ae7eefULL, 0x7dac1ce69bb056cdULL, 0x76437894e23baf4dULL, 0xbdb16106bbd0d66dULL, 0x9b32f1da41c31958ULL, 0x7957e5176eb2a5cbULL, 0xf941b35ca5f2ae0bULL, 0x8016564bcb400bc0ULL, 0x677fc20c6bbdb1daULL, 0x59dc7ecc95a26efbULL, 0xe1619f40a1febe1fULL, 0x10cbc3e3f308eb18ULL, 0x81e12f30b1cefe4fULL, 0x0c10160e0206080aULL, 0x922e675ecc4917dbULL, 0xa26e3f66c45137f3ULL, 0x4ee8cf531d277469ULL, 0x78a09c6c143c5044ULL, 0xb0560e73c3582be8ULL, 0x573f9a3463a591f2ULL, 0xe69eed3cda734f95ULL, 0xd3d2358e5de76934ULL, 0xdfc223805fe1613eULL, 0xf2aed72edc79578bULL, 0x13cf486e7d87e994ULL, 0x94266c59cd4a13deULL, 0x1fdf5e607f81e19eULL, 0xc1ea049b5aee752fULL, 0x7547f3196cb4adc1ULL, 0xd5da3e895ce46d31ULL, 0x08ebeffff704fb0cULL, 0xd42d47f2266a98beULL, 0x38abb7c7ff1cdb24ULL, 0x543b11b9ed2a937eULL, 0x4a1336a2e825876fULL, 0x699c26f49dba4ed3ULL, 0x7f5fee106fb1a1ceULL, 0x03048b8d8e8f028cULL, 0x56c8e34f192b647dULL, 0xe7699447a0fdba1aULL, 0x1ad3deeaf00de717ULL, 0x113cba9889861e97ULL, 0x2278692d0f113c33ULL, 0x1238311507091c1bULL, 0xc511fd6aafec8629ULL, 0x208b9bdbfb10cb30ULL, 0x3040583808182028ULL, 0x7ea8976b153f5441ULL, 0x2e687f230d173439ULL, 0x18202c1c040c1014ULL, 0x06080b0701030405ULL, 0x4507ab2164ac8de9ULL, 0xf8b6ca27df7c5b84ULL, 0x29970d5f769ac5b3ULL, 0x0bef6472798bf980ULL, 0xf4a6dc29dd7a538eULL, 0x8ef5b2b33d47f4c9ULL, 0x74b08a62163a584eULL, 0x82e5a4bd3f41fcc3ULL, 0xb2a5fc853759dcebULL, 0x734ff81e6db7a9c4ULL, 0x90dd95a83848e0d8ULL, 0xb1a17708b9d6de67ULL, 0x37bf2a447395d1a2ULL, 0x4c1b3da5e926836aULL, 0xbeb5ea8b355fd4e1ULL, 0xe3926db655ff491cULL, 0x3baf3c4a7193d9a8ULL, 0x07ff727c7b8df18aULL, 0x0f149d838c890a86ULL, 0x31b721437296d5a7ULL, 0x1734b19f88851a92ULL, 0x0ee3e4f8f607ff09ULL, 0xfc4d33d62a7ea882ULL, 0x84edafba3e42f8c6ULL, 0xd9ca28875ee2653bULL, 0xd2254cf527699cbbULL, 0x890ac0cf46ca0543ULL, 0x286074240c14303cULL, 0x430fa02665af89ecULL, 0x6d67df0568b8bdd5ULL, 0x5b2f8c3a61a399f8ULL, 0x0a181d0903050c0fULL, 0xbc46187dc15e23e2ULL, 0xef827bb857f94116ULL, 0xcefe9918d6677fa9ULL, 0xec86f035d976439aULL, 0xcdfa129558e87d25ULL, 0xea8efb32d875479fULL, 0x4917bd2f66aa85e3ULL, 0xc8f6921fd7647bacULL, 0x9ccd83a63a4ee8d2ULL, 0x8a0e4b42c84507cfULL, 0x88fdb9b43c44f0ccULL, 0x268390dcfa13cf35ULL, 0x53c463c596a762f4ULL, 0xf551a552a7f4a601ULL, 0x77b401ef98b55ac2ULL, 0x52331abeec29977bULL, 0xb7a97c0fb8d5da62ULL, 0xa876226fc7543bfcULL, 0xc319f66daeef822cULL, 0x6b6fd40269bbb9d0ULL, 0xa762bfec4bdd317aULL, 0xdd31d176abe0963dULL, 0xd121c778a9e69e37ULL, 0x4f1fb62867a981e6ULL, 0x3c504e360a1e2822ULL, 0x8f02cbc847c90146ULL, 0x16c3c8e4f20bef1dULL, 0x99c1032cb5c2ee5bULL, 0xcc0d6bee226688aaULL, 0x647b4981e532b356ULL, 0x5e230cb0ee2f9f71ULL, 0xa399461dbedfc27cULL, 0xfa4538d12b7dac87ULL, 0x217ce2a0819e3ebfULL, 0x6c90a67e1236485aULL, 0x2d6cf4ae839836b5ULL, 0x5ad8f5411b2d6c77ULL, 0x2470622a0e123836ULL, 0xca0560e923658cafULL, 0x04fbf9f1f502f306ULL, 0x8312ddc645cf094cULL, 0xc61576e7216384a5ULL, 0x9e3e7150ce4f1fd1ULL, 0xab72a9e249db3970ULL, 0xe87d09c42c74b09cULL, 0x2c9b8dd5f916c33aULL, 0x6e635488e637bf59ULL, 0x93d91e25b6c7e254ULL, 0xf05d25d82878a088ULL, 0x72b8816517395c4bULL, 0x2b64ffa9829b32b0ULL, 0x5cd0fe461a2e6872ULL, 0x1d2cac968b80169dULL, 0x3ea3bcc0fe1fdf21ULL, 0x1b24a7918a831298ULL, 0x3648533f091b242dULL, 0x8c064045c94603caULL, 0x354cd8b2879426a1ULL, 0xb94a98f74ed2256bULL, 0x7c5b659de13ea342ULL, 0xe46d1fca2e72b896ULL, 0x62734286e431b753ULL, 0x7a536e9ae03da747ULL, 0x400b2babeb208b60ULL, 0x47f459d790ad7aeaULL, 0xff49b85ba4f1aa0eULL, 0x44f0d25a1e227866ULL, 0x395ccebc85922eabULL, 0x5d27873d60a09dfdULL, 0x0000000000000000ULL, 0xde355afb256f94b1ULL, 0x02f3f2f6f401f703ULL, 0x1cdbd5edf10ee312ULL, 0x5fd475cb94a16afeULL, 0x3a5845310b1d2c27ULL, 0x686b5f8fe734bb5cULL, 0x238f1056759fc9bcULL, 0x582b07b7ef2c9b74ULL, 0xb8bde18c345cd0e4ULL, 0xa695c6973153c4f5ULL, 0xc2ee8f16d46177a3ULL, 0xdacea30ad06d67b7ULL, 0x3344d3b5869722a4ULL, 0x19d755677e82e59bULL, 0xc901eb64adea8e23ULL, 0x34bba1c9fd1ad32eULL, 0xf6552edf297ba48dULL, 0xa09dcd903050c0f0ULL, 0x9ac588a13b4decd7ULL, 0x658c30fa9fbc46d9ULL, 0x2a9386d2f815c73fULL, 0xae7e2968c6573ff9ULL, 0x6a98ad7913354c5fULL, 0x14303a12060a181eULL, 0x1e28271b050f1411ULL, 0xa4663461c55233f6ULL, 0x6688bb7711334455ULL, 0x2f9f06587799c1b6ULL, 0x15c743697c84ed91ULL, 0x01f7797b7a8ef58fULL, 0x0de76f757888fd85ULL, 0xb4adf782365ad8eeULL, 0x48e0c4541c24706cULL, 0x96d59eaf394be4ddULL, 0xcbf2199259eb7920ULL, 0x50c0e84818286078ULL, 0xe98a70bf56fa4513ULL, 0x8df1393eb3c8f645ULL, 0x87e92437b0cdfa4aULL, 0xd83d51fc246c90b4ULL, 0xc01d7de0206080a0ULL, 0x8bf93239b2cbf240ULL, 0x4be44fd992ab72e0ULL, 0xed71894ea3f8b615ULL, 0xba4e137ac05d27e7ULL, 0x851ad6c144cc0d49ULL, 0x5137913362a695f7ULL, 0x6080b07010304050ULL, 0x9fc9082bb4c1ea5eULL, 0x3f54c5bb84912aaeULL, 0x9722e7d443c51152ULL, 0x4dec44de93a876e5ULL, 0xb65e0574c25b2fedULL, 0xa16ab4eb4ade357fULL, 0xa9815b14bddace73ULL, 0x050c808a8f8c0689ULL, 0xee7502c32d77b499ULL, 0xaf895013bcd9ca76ULL, 0x6f942df39cb94ad6ULL, 0x6177c90b6abeb5dfULL, 0x9d3afadd40c01d5dULL, 0x98367a57cf4c1bd4ULL, 0xeb798249a2fbb210ULL, 0x2774e9a7809d3abaULL, 0xbf4293f04fd1216eULL, 0x42f8d95d1f217c63ULL, 0x861e5d4cca430fc5ULL, 0xdb39da71aae39238ULL, 0x912aecd342c61557ULL }; static const u64 T5[256] = { 0xb9bb016ad3ba68d2ULL, 0x9ae5b166fc54194dULL, 0x65e2cd14712f93bcULL, 0x8725511b9c74b9cdULL, 0xa2f7a457f5530251ULL, 0xd6d003be68d3b86bULL, 0xded604b56bd2bd6fULL, 0x52b3fe85d74d6429ULL, 0xbafdad4af0500d5dULL, 0x09cf63e0e9ac268aULL, 0x1c0984968a8d830eULL, 0x91a51a4ddcbf79c6ULL, 0xa73d4d379070adddULL, 0xaaf1a35cf6520755ULL, 0xa47be117b39ac852ULL, 0x5ab5f98ed44c612dULL, 0x0346ac2023ea658fULL, 0xe6c4118462d5a673ULL, 0xcc55c268a497f166ULL, 0xc6dc0da86ed1b263ULL, 0x85aa99d05533ffccULL, 0xb2fbaa41f3510859ULL, 0xe2c79c0fed5b2a71ULL, 0x59f355aef7a604a2ULL, 0xbefe20c17fde815fULL, 0x7aade5a2d848753dULL, 0x29d77fcce5a8329aULL, 0xbc71e80ab699c75eULL, 0x96e03be670db904bULL, 0x8dac9edb5632fac8ULL, 0xd1952215c4b751e6ULL, 0xb332ceaa19fc2bd7ULL, 0x4b70937338e348abULL, 0x8463fd3bbf9edc42ULL, 0xfc41d052ae91ef7eULL, 0xac7de61cb09bcd56ULL, 0x437694783be24dafULL, 0xb1bd0661d0bb6dd6ULL, 0x329bdaf1c3415819ULL, 0x577917e5b26ecba5ULL, 0x41f95cb3f2a50baeULL, 0x16804b5640cbc00bULL, 0x7f670cc2bd6bdab1ULL, 0xdc59cc7ea295fb6eULL, 0x61e1409ffea11fbeULL, 0xcb10e3c308f318ebULL, 0xe181302fceb14ffeULL, 0x100c0e1606020a08ULL, 0x2e925e6749ccdb17ULL, 0x6ea2663f51c4f337ULL, 0xe84e53cf271d6974ULL, 0xa0786c9c3c144450ULL, 0x56b0730e58c3e82bULL, 0x3f57349aa563f291ULL, 0x9ee63ced73da954fULL, 0xd2d38e35e75d3469ULL, 0xc2df8023e15f3e61ULL, 0xaef22ed779dc8b57ULL, 0xcf136e48877d94e9ULL, 0x2694596c4acdde13ULL, 0xdf1f605e817f9ee1ULL, 0xeac19b04ee5a2f75ULL, 0x477519f3b46cc1adULL, 0xdad5893ee45c316dULL, 0xeb08ffef04f70cfbULL, 0x2dd4f2476a26be98ULL, 0xab38c7b71cff24dbULL, 0x3b54b9112aed7e93ULL, 0x134aa23625e86f87ULL, 0x9c69f426ba9dd34eULL, 0x5f7f10eeb16fcea1ULL, 0x04038d8b8f8e8c02ULL, 0xc8564fe32b197d64ULL, 0x69e74794fda01abaULL, 0xd31aeade0df017e7ULL, 0x3c1198ba8689971eULL, 0x78222d69110f333cULL, 0x3812153109071b1cULL, 0x11c56afdecaf2986ULL, 0x8b20db9b10fb30cbULL, 0x4030385818082820ULL, 0xa87e6b973f154154ULL, 0x682e237f170d3934ULL, 0x20181c2c0c041410ULL, 0x0806070b03010504ULL, 0x074521abac64e98dULL, 0xb6f827ca7cdf845bULL, 0x97295f0d9a76b3c5ULL, 0xef0b72648b7980f9ULL, 0xa6f429dc7add8e53ULL, 0xf58eb3b2473dc9f4ULL, 0xb074628a3a164e58ULL, 0xe582bda4413fc3fcULL, 0xa5b285fc5937ebdcULL, 0x4f731ef8b76dc4a9ULL, 0xdd90a8954838d8e0ULL, 0xa1b10877d6b967deULL, 0xbf37442a9573a2d1ULL, 0x1b4ca53d26e96a83ULL, 0xb5be8bea5f35e1d4ULL, 0x92e3b66dff551c49ULL, 0xaf3b4a3c9371a8d9ULL, 0xff077c728d7b8af1ULL, 0x140f839d898c860aULL, 0xb73143219672a7d5ULL, 0x34179fb18588921aULL, 0xe30ef8e407f609ffULL, 0x4dfcd6337e2a82a8ULL, 0xed84baaf423ec6f8ULL, 0xcad98728e25e3b65ULL, 0x25d2f54c6927bb9cULL, 0x0a89cfc0ca464305ULL, 0x60282474140c3c30ULL, 0x0f4326a0af65ec89ULL, 0x676d05dfb868d5bdULL, 0x2f5b3a8ca361f899ULL, 0x180a091d05030f0cULL, 0x46bc7d185ec1e223ULL, 0x82efb87bf9571641ULL, 0xfece189967d6a97fULL, 0x86ec35f076d99a43ULL, 0xfacd9512e858257dULL, 0x8eea32fb75d89f47ULL, 0x17492fbdaa66e385ULL, 0xf6c81f9264d7ac7bULL, 0xcd9ca6834e3ad2e8ULL, 0x0e8a424b45c8cf07ULL, 0xfd88b4b9443cccf0ULL, 0x8326dc9013fa35cfULL, 0xc453c563a796f462ULL, 0x51f552a5f4a701a6ULL, 0xb477ef01b598c25aULL, 0x3352be1a29ec7b97ULL, 0xa9b70f7cd5b862daULL, 0x76a86f2254c7fc3bULL, 0x19c36df6efae2c82ULL, 0x6f6b02d4bb69d0b9ULL, 0x62a7ecbfdd4b7a31ULL, 0x31dd76d1e0ab3d96ULL, 0x21d178c7e6a9379eULL, 0x1f4f28b6a967e681ULL, 0x503c364e1e0a2228ULL, 0x028fc8cbc9474601ULL, 0xc316e4c80bf21defULL, 0xc1992c03c2b55beeULL, 0x0dccee6b6622aa88ULL, 0x7b64814932e556b3ULL, 0x235eb00c2fee719fULL, 0x99a31d46dfbe7cc2ULL, 0x45fad1387d2b87acULL, 0x7c21a0e29e81bf3eULL, 0x906c7ea636125a48ULL, 0x6c2daef49883b536ULL, 0xd85a41f52d1b776cULL, 0x70242a62120e3638ULL, 0x05cae9606523af8cULL, 0xfb04f1f902f506f3ULL, 0x1283c6ddcf454c09ULL, 0x15c6e7766321a584ULL, 0x3e9e50714fced11fULL, 0x72abe2a9db497039ULL, 0x7de8c409742c9cb0ULL, 0x9b2cd58d16f93ac3ULL, 0x636e885437e659bfULL, 0xd993251ec7b654e2ULL, 0x5df0d825782888a0ULL, 0xb872658139174b5cULL, 0x642ba9ff9b82b032ULL, 0xd05c46fe2e1a7268ULL, 0x2c1d96ac808b9d16ULL, 0xa33ec0bc1ffe21dfULL, 0x241b91a7838a9812ULL, 0x48363f531b092d24ULL, 0x068c454046c9ca03ULL, 0x4c35b2d89487a126ULL, 0x4ab9f798d24e6b25ULL, 0x5b7c9d653ee142a3ULL, 0x6de4ca1f722e96b8ULL, 0x7362864231e453b7ULL, 0x537a9a6e3de047a7ULL, 0x0b40ab2b20eb608bULL, 0xf447d759ad90ea7aULL, 0x49ff5bb8f1a40eaaULL, 0xf0445ad2221e6678ULL, 0x5c39bcce9285ab2eULL, 0x275d3d87a060fd9dULL, 0x0000000000000000ULL, 0x35defb5a6f25b194ULL, 0xf302f6f201f403f7ULL, 0xdb1cedd50ef112e3ULL, 0xd45fcb75a194fe6aULL, 0x583a31451d0b272cULL, 0x6b688f5f34e75cbbULL, 0x8f2356109f75bcc9ULL, 0x2b58b7072cef749bULL, 0xbdb88ce15c34e4d0ULL, 0x95a697c65331f5c4ULL, 0xeec2168f61d4a377ULL, 0xceda0aa36dd0b767ULL, 0x4433b5d39786a422ULL, 0xd7196755827e9be5ULL, 0x01c964ebeaad238eULL, 0xbb34c9a11afd2ed3ULL, 0x55f6df2e7b298da4ULL, 0x9da090cd5030f0c0ULL, 0xc59aa1884d3bd7ecULL, 0x8c65fa30bc9fd946ULL, 0x932ad28615f83fc7ULL, 0x7eae682957c6f93fULL, 0x986a79ad35135f4cULL, 0x3014123a0a061e18ULL, 0x281e1b270f051114ULL, 0x66a4613452c5f633ULL, 0x886677bb33115544ULL, 0x9f2f58069977b6c1ULL, 0xc7156943847c91edULL, 0xf7017b798e7a8ff5ULL, 0xe70d756f887885fdULL, 0xadb482f75a36eed8ULL, 0xe04854c4241c6c70ULL, 0xd596af9e4b39dde4ULL, 0xf2cb9219eb592079ULL, 0xc05048e828187860ULL, 0x8ae9bf70fa561345ULL, 0xf18d3e39c8b345f6ULL, 0xe9873724cdb04afaULL, 0x3dd8fc516c24b490ULL, 0x1dc0e07d6020a080ULL, 0xf98b3932cbb240f2ULL, 0xe44bd94fab92e072ULL, 0x71ed4e89f8a315b6ULL, 0x4eba7a135dc0e727ULL, 0x1a85c1d6cc44490dULL, 0x37513391a662f795ULL, 0x806070b030105040ULL, 0xc99f2b08c1b45eeaULL, 0x543fbbc59184ae2aULL, 0x2297d4e7c5435211ULL, 0xec4dde44a893e576ULL, 0x5eb674055bc2ed2fULL, 0x6aa1ebb4de4a7f35ULL, 0x81a9145bdabd73ceULL, 0x0c058a808c8f8906ULL, 0x75eec302772d99b4ULL, 0x89af1350d9bc76caULL, 0x946ff32db99cd64aULL, 0x77610bc9be6adfb5ULL, 0x3a9dddfac0405d1dULL, 0x3698577a4ccfd41bULL, 0x79eb4982fba210b2ULL, 0x7427a7e99d80ba3aULL, 0x42bff093d14f6e21ULL, 0xf8425dd9211f637cULL, 0x1e864c5d43cac50fULL, 0x39db71dae3aa3892ULL, 0x2a91d3ecc6425715ULL }; static const u64 T6[256] = { 0x6a01bbb9d268bad3ULL, 0x66b1e59a4d1954fcULL, 0x14cde265bc932f71ULL, 0x1b512587cdb9749cULL, 0x57a4f7a2510253f5ULL, 0xbe03d0d66bb8d368ULL, 0xb504d6de6fbdd26bULL, 0x85feb35229644dd7ULL, 0x4aadfdba5d0d50f0ULL, 0xe063cf098a26ace9ULL, 0x9684091c0e838d8aULL, 0x4d1aa591c679bfdcULL, 0x374d3da7ddad7090ULL, 0x5ca3f1aa550752f6ULL, 0x17e17ba452c89ab3ULL, 0x8ef9b55a2d614cd4ULL, 0x20ac46038f65ea23ULL, 0x8411c4e673a6d562ULL, 0x68c255cc66f197a4ULL, 0xa80ddcc663b2d16eULL, 0xd099aa85ccff3355ULL, 0x41aafbb2590851f3ULL, 0x0f9cc7e2712a5bedULL, 0xae55f359a204a6f7ULL, 0xc120febe5f81de7fULL, 0xa2e5ad7a3d7548d8ULL, 0xcc7fd7299a32a8e5ULL, 0x0ae871bc5ec799b6ULL, 0xe63be0964b90db70ULL, 0xdb9eac8dc8fa3256ULL, 0x152295d1e651b7c4ULL, 0xaace32b3d72bfc19ULL, 0x7393704bab48e338ULL, 0x3bfd638442dc9ebfULL, 0x52d041fc7eef91aeULL, 0x1ce67dac56cd9bb0ULL, 0x78947643af4de23bULL, 0x6106bdb1d66dbbd0ULL, 0xf1da9b32195841c3ULL, 0xe5177957a5cb6eb2ULL, 0xb35cf941ae0ba5f2ULL, 0x564b80160bc0cb40ULL, 0xc20c677fb1da6bbdULL, 0x7ecc59dc6efb95a2ULL, 0x9f40e161be1fa1feULL, 0xc3e310cbeb18f308ULL, 0x2f3081e1fe4fb1ceULL, 0x160e0c10080a0206ULL, 0x675e922e17dbcc49ULL, 0x3f66a26e37f3c451ULL, 0xcf534ee874691d27ULL, 0x9c6c78a05044143cULL, 0x0e73b0562be8c358ULL, 0x9a34573f91f263a5ULL, 0xed3ce69e4f95da73ULL, 0x358ed3d269345de7ULL, 0x2380dfc2613e5fe1ULL, 0xd72ef2ae578bdc79ULL, 0x486e13cfe9947d87ULL, 0x6c59942613decd4aULL, 0x5e601fdfe19e7f81ULL, 0x049bc1ea752f5aeeULL, 0xf3197547adc16cb4ULL, 0x3e89d5da6d315ce4ULL, 0xefff08ebfb0cf704ULL, 0x47f2d42d98be266aULL, 0xb7c738abdb24ff1cULL, 0x11b9543b937eed2aULL, 0x36a24a13876fe825ULL, 0x26f4699c4ed39dbaULL, 0xee107f5fa1ce6fb1ULL, 0x8b8d0304028c8e8fULL, 0xe34f56c8647d192bULL, 0x9447e769ba1aa0fdULL, 0xdeea1ad3e717f00dULL, 0xba98113c1e978986ULL, 0x692d22783c330f11ULL, 0x311512381c1b0709ULL, 0xfd6ac5118629afecULL, 0x9bdb208bcb30fb10ULL, 0x5838304020280818ULL, 0x976b7ea85441153fULL, 0x7f232e6834390d17ULL, 0x2c1c18201014040cULL, 0x0b07060804050103ULL, 0xab2145078de964acULL, 0xca27f8b65b84df7cULL, 0x0d5f2997c5b3769aULL, 0x64720beff980798bULL, 0xdc29f4a6538edd7aULL, 0xb2b38ef5f4c93d47ULL, 0x8a6274b0584e163aULL, 0xa4bd82e5fcc33f41ULL, 0xfc85b2a5dceb3759ULL, 0xf81e734fa9c46db7ULL, 0x95a890dde0d83848ULL, 0x7708b1a1de67b9d6ULL, 0x2a4437bfd1a27395ULL, 0x3da54c1b836ae926ULL, 0xea8bbeb5d4e1355fULL, 0x6db6e392491c55ffULL, 0x3c4a3bafd9a87193ULL, 0x727c07fff18a7b8dULL, 0x9d830f140a868c89ULL, 0x214331b7d5a77296ULL, 0xb19f17341a928885ULL, 0xe4f80ee3ff09f607ULL, 0x33d6fc4da8822a7eULL, 0xafba84edf8c63e42ULL, 0x2887d9ca653b5ee2ULL, 0x4cf5d2259cbb2769ULL, 0xc0cf890a054346caULL, 0x74242860303c0c14ULL, 0xa026430f89ec65afULL, 0xdf056d67bdd568b8ULL, 0x8c3a5b2f99f861a3ULL, 0x1d090a180c0f0305ULL, 0x187dbc4623e2c15eULL, 0x7bb8ef82411657f9ULL, 0x9918cefe7fa9d667ULL, 0xf035ec86439ad976ULL, 0x1295cdfa7d2558e8ULL, 0xfb32ea8e479fd875ULL, 0xbd2f491785e366aaULL, 0x921fc8f67bacd764ULL, 0x83a69ccde8d23a4eULL, 0x4b428a0e07cfc845ULL, 0xb9b488fdf0cc3c44ULL, 0x90dc2683cf35fa13ULL, 0x63c553c462f496a7ULL, 0xa552f551a601a7f4ULL, 0x01ef77b45ac298b5ULL, 0x1abe5233977bec29ULL, 0x7c0fb7a9da62b8d5ULL, 0x226fa8763bfcc754ULL, 0xf66dc319822caeefULL, 0xd4026b6fb9d069bbULL, 0xbfeca762317a4bddULL, 0xd176dd31963dabe0ULL, 0xc778d1219e37a9e6ULL, 0xb6284f1f81e667a9ULL, 0x4e363c5028220a1eULL, 0xcbc88f02014647c9ULL, 0xc8e416c3ef1df20bULL, 0x032c99c1ee5bb5c2ULL, 0x6beecc0d88aa2266ULL, 0x4981647bb356e532ULL, 0x0cb05e239f71ee2fULL, 0x461da399c27cbedfULL, 0x38d1fa45ac872b7dULL, 0xe2a0217c3ebf819eULL, 0xa67e6c90485a1236ULL, 0xf4ae2d6c36b58398ULL, 0xf5415ad86c771b2dULL, 0x622a247038360e12ULL, 0x60e9ca058caf2365ULL, 0xf9f104fbf306f502ULL, 0xddc68312094c45cfULL, 0x76e7c61584a52163ULL, 0x71509e3e1fd1ce4fULL, 0xa9e2ab72397049dbULL, 0x09c4e87db09c2c74ULL, 0x8dd52c9bc33af916ULL, 0x54886e63bf59e637ULL, 0x1e2593d9e254b6c7ULL, 0x25d8f05da0882878ULL, 0x816572b85c4b1739ULL, 0xffa92b6432b0829bULL, 0xfe465cd068721a2eULL, 0xac961d2c169d8b80ULL, 0xbcc03ea3df21fe1fULL, 0xa7911b2412988a83ULL, 0x533f3648242d091bULL, 0x40458c0603cac946ULL, 0xd8b2354c26a18794ULL, 0x98f7b94a256b4ed2ULL, 0x659d7c5ba342e13eULL, 0x1fcae46db8962e72ULL, 0x42866273b753e431ULL, 0x6e9a7a53a747e03dULL, 0x2bab400b8b60eb20ULL, 0x59d747f47aea90adULL, 0xb85bff49aa0ea4f1ULL, 0xd25a44f078661e22ULL, 0xcebc395c2eab8592ULL, 0x873d5d279dfd60a0ULL, 0x0000000000000000ULL, 0x5afbde3594b1256fULL, 0xf2f602f3f703f401ULL, 0xd5ed1cdbe312f10eULL, 0x75cb5fd46afe94a1ULL, 0x45313a582c270b1dULL, 0x5f8f686bbb5ce734ULL, 0x1056238fc9bc759fULL, 0x07b7582b9b74ef2cULL, 0xe18cb8bdd0e4345cULL, 0xc697a695c4f53153ULL, 0x8f16c2ee77a3d461ULL, 0xa30adace67b7d06dULL, 0xd3b5334422a48697ULL, 0x556719d7e59b7e82ULL, 0xeb64c9018e23adeaULL, 0xa1c934bbd32efd1aULL, 0x2edff655a48d297bULL, 0xcd90a09dc0f03050ULL, 0x88a19ac5ecd73b4dULL, 0x30fa658c46d99fbcULL, 0x86d22a93c73ff815ULL, 0x2968ae7e3ff9c657ULL, 0xad796a984c5f1335ULL, 0x3a121430181e060aULL, 0x271b1e281411050fULL, 0x3461a46633f6c552ULL, 0xbb77668844551133ULL, 0x06582f9fc1b67799ULL, 0x436915c7ed917c84ULL, 0x797b01f7f58f7a8eULL, 0x6f750de7fd857888ULL, 0xf782b4add8ee365aULL, 0xc45448e0706c1c24ULL, 0x9eaf96d5e4dd394bULL, 0x1992cbf2792059ebULL, 0xe84850c060781828ULL, 0x70bfe98a451356faULL, 0x393e8df1f645b3c8ULL, 0x243787e9fa4ab0cdULL, 0x51fcd83d90b4246cULL, 0x7de0c01d80a02060ULL, 0x32398bf9f240b2cbULL, 0x4fd94be472e092abULL, 0x894eed71b615a3f8ULL, 0x137aba4e27e7c05dULL, 0xd6c1851a0d4944ccULL, 0x9133513795f762a6ULL, 0xb070608040501030ULL, 0x082b9fc9ea5eb4c1ULL, 0xc5bb3f542aae8491ULL, 0xe7d49722115243c5ULL, 0x44de4dec76e593a8ULL, 0x0574b65e2fedc25bULL, 0xb4eba16a357f4adeULL, 0x5b14a981ce73bddaULL, 0x808a050c06898f8cULL, 0x02c3ee75b4992d77ULL, 0x5013af89ca76bcd9ULL, 0x2df36f944ad69cb9ULL, 0xc90b6177b5df6abeULL, 0xfadd9d3a1d5d40c0ULL, 0x7a5798361bd4cf4cULL, 0x8249eb79b210a2fbULL, 0xe9a727743aba809dULL, 0x93f0bf42216e4fd1ULL, 0xd95d42f87c631f21ULL, 0x5d4c861e0fc5ca43ULL, 0xda71db399238aae3ULL, 0xecd3912a155742c6ULL }; static const u64 T7[256] = { 0x016ab9bb68d2d3baULL, 0xb1669ae5194dfc54ULL, 0xcd1465e293bc712fULL, 0x511b8725b9cd9c74ULL, 0xa457a2f70251f553ULL, 0x03bed6d0b86b68d3ULL, 0x04b5ded6bd6f6bd2ULL, 0xfe8552b36429d74dULL, 0xad4abafd0d5df050ULL, 0x63e009cf268ae9acULL, 0x84961c09830e8a8dULL, 0x1a4d91a579c6dcbfULL, 0x4d37a73daddd9070ULL, 0xa35caaf10755f652ULL, 0xe117a47bc852b39aULL, 0xf98e5ab5612dd44cULL, 0xac200346658f23eaULL, 0x1184e6c4a67362d5ULL, 0xc268cc55f166a497ULL, 0x0da8c6dcb2636ed1ULL, 0x99d085aaffcc5533ULL, 0xaa41b2fb0859f351ULL, 0x9c0fe2c72a71ed5bULL, 0x55ae59f304a2f7a6ULL, 0x20c1befe815f7fdeULL, 0xe5a27aad753dd848ULL, 0x7fcc29d7329ae5a8ULL, 0xe80abc71c75eb699ULL, 0x3be696e0904b70dbULL, 0x9edb8dacfac85632ULL, 0x2215d19551e6c4b7ULL, 0xceaab3322bd719fcULL, 0x93734b7048ab38e3ULL, 0xfd3b8463dc42bf9eULL, 0xd052fc41ef7eae91ULL, 0xe61cac7dcd56b09bULL, 0x947843764daf3be2ULL, 0x0661b1bd6dd6d0bbULL, 0xdaf1329b5819c341ULL, 0x17e55779cba5b26eULL, 0x5cb341f90baef2a5ULL, 0x4b561680c00b40cbULL, 0x0cc27f67dab1bd6bULL, 0xcc7edc59fb6ea295ULL, 0x409f61e11fbefea1ULL, 0xe3c3cb1018eb08f3ULL, 0x302fe1814ffeceb1ULL, 0x0e16100c0a080602ULL, 0x5e672e92db1749ccULL, 0x663f6ea2f33751c4ULL, 0x53cfe84e6974271dULL, 0x6c9ca07844503c14ULL, 0x730e56b0e82b58c3ULL, 0x349a3f57f291a563ULL, 0x3ced9ee6954f73daULL, 0x8e35d2d33469e75dULL, 0x8023c2df3e61e15fULL, 0x2ed7aef28b5779dcULL, 0x6e48cf1394e9877dULL, 0x596c2694de134acdULL, 0x605edf1f9ee1817fULL, 0x9b04eac12f75ee5aULL, 0x19f34775c1adb46cULL, 0x893edad5316de45cULL, 0xffefeb080cfb04f7ULL, 0xf2472dd4be986a26ULL, 0xc7b7ab3824db1cffULL, 0xb9113b547e932aedULL, 0xa236134a6f8725e8ULL, 0xf4269c69d34eba9dULL, 0x10ee5f7fcea1b16fULL, 0x8d8b04038c028f8eULL, 0x4fe3c8567d642b19ULL, 0x479469e71abafda0ULL, 0xeaded31a17e70df0ULL, 0x98ba3c11971e8689ULL, 0x2d697822333c110fULL, 0x153138121b1c0907ULL, 0x6afd11c52986ecafULL, 0xdb9b8b2030cb10fbULL, 0x3858403028201808ULL, 0x6b97a87e41543f15ULL, 0x237f682e3934170dULL, 0x1c2c201814100c04ULL, 0x070b080605040301ULL, 0x21ab0745e98dac64ULL, 0x27cab6f8845b7cdfULL, 0x5f0d9729b3c59a76ULL, 0x7264ef0b80f98b79ULL, 0x29dca6f48e537addULL, 0xb3b2f58ec9f4473dULL, 0x628ab0744e583a16ULL, 0xbda4e582c3fc413fULL, 0x85fca5b2ebdc5937ULL, 0x1ef84f73c4a9b76dULL, 0xa895dd90d8e04838ULL, 0x0877a1b167ded6b9ULL, 0x442abf37a2d19573ULL, 0xa53d1b4c6a8326e9ULL, 0x8beab5bee1d45f35ULL, 0xb66d92e31c49ff55ULL, 0x4a3caf3ba8d99371ULL, 0x7c72ff078af18d7bULL, 0x839d140f860a898cULL, 0x4321b731a7d59672ULL, 0x9fb13417921a8588ULL, 0xf8e4e30e09ff07f6ULL, 0xd6334dfc82a87e2aULL, 0xbaafed84c6f8423eULL, 0x8728cad93b65e25eULL, 0xf54c25d2bb9c6927ULL, 0xcfc00a894305ca46ULL, 0x247460283c30140cULL, 0x26a00f43ec89af65ULL, 0x05df676dd5bdb868ULL, 0x3a8c2f5bf899a361ULL, 0x091d180a0f0c0503ULL, 0x7d1846bce2235ec1ULL, 0xb87b82ef1641f957ULL, 0x1899fecea97f67d6ULL, 0x35f086ec9a4376d9ULL, 0x9512facd257de858ULL, 0x32fb8eea9f4775d8ULL, 0x2fbd1749e385aa66ULL, 0x1f92f6c8ac7b64d7ULL, 0xa683cd9cd2e84e3aULL, 0x424b0e8acf0745c8ULL, 0xb4b9fd88ccf0443cULL, 0xdc90832635cf13faULL, 0xc563c453f462a796ULL, 0x52a551f501a6f4a7ULL, 0xef01b477c25ab598ULL, 0xbe1a33527b9729ecULL, 0x0f7ca9b762dad5b8ULL, 0x6f2276a8fc3b54c7ULL, 0x6df619c32c82efaeULL, 0x02d46f6bd0b9bb69ULL, 0xecbf62a77a31dd4bULL, 0x76d131dd3d96e0abULL, 0x78c721d1379ee6a9ULL, 0x28b61f4fe681a967ULL, 0x364e503c22281e0aULL, 0xc8cb028f4601c947ULL, 0xe4c8c3161def0bf2ULL, 0x2c03c1995beec2b5ULL, 0xee6b0dccaa886622ULL, 0x81497b6456b332e5ULL, 0xb00c235e719f2feeULL, 0x1d4699a37cc2dfbeULL, 0xd13845fa87ac7d2bULL, 0xa0e27c21bf3e9e81ULL, 0x7ea6906c5a483612ULL, 0xaef46c2db5369883ULL, 0x41f5d85a776c2d1bULL, 0x2a6270243638120eULL, 0xe96005caaf8c6523ULL, 0xf1f9fb0406f302f5ULL, 0xc6dd12834c09cf45ULL, 0xe77615c6a5846321ULL, 0x50713e9ed11f4fceULL, 0xe2a972ab7039db49ULL, 0xc4097de89cb0742cULL, 0xd58d9b2c3ac316f9ULL, 0x8854636e59bf37e6ULL, 0x251ed99354e2c7b6ULL, 0xd8255df088a07828ULL, 0x6581b8724b5c3917ULL, 0xa9ff642bb0329b82ULL, 0x46fed05c72682e1aULL, 0x96ac2c1d9d16808bULL, 0xc0bca33e21df1ffeULL, 0x91a7241b9812838aULL, 0x3f5348362d241b09ULL, 0x4540068cca0346c9ULL, 0xb2d84c35a1269487ULL, 0xf7984ab96b25d24eULL, 0x9d655b7c42a33ee1ULL, 0xca1f6de496b8722eULL, 0x8642736253b731e4ULL, 0x9a6e537a47a73de0ULL, 0xab2b0b40608b20ebULL, 0xd759f447ea7aad90ULL, 0x5bb849ff0eaaf1a4ULL, 0x5ad2f0446678221eULL, 0xbcce5c39ab2e9285ULL, 0x3d87275dfd9da060ULL, 0x0000000000000000ULL, 0xfb5a35deb1946f25ULL, 0xf6f2f30203f701f4ULL, 0xedd5db1c12e30ef1ULL, 0xcb75d45ffe6aa194ULL, 0x3145583a272c1d0bULL, 0x8f5f6b685cbb34e7ULL, 0x56108f23bcc99f75ULL, 0xb7072b58749b2cefULL, 0x8ce1bdb8e4d05c34ULL, 0x97c695a6f5c45331ULL, 0x168feec2a37761d4ULL, 0x0aa3cedab7676dd0ULL, 0xb5d34433a4229786ULL, 0x6755d7199be5827eULL, 0x64eb01c9238eeaadULL, 0xc9a1bb342ed31afdULL, 0xdf2e55f68da47b29ULL, 0x90cd9da0f0c05030ULL, 0xa188c59ad7ec4d3bULL, 0xfa308c65d946bc9fULL, 0xd286932a3fc715f8ULL, 0x68297eaef93f57c6ULL, 0x79ad986a5f4c3513ULL, 0x123a30141e180a06ULL, 0x1b27281e11140f05ULL, 0x613466a4f63352c5ULL, 0x77bb886655443311ULL, 0x58069f2fb6c19977ULL, 0x6943c71591ed847cULL, 0x7b79f7018ff58e7aULL, 0x756fe70d85fd8878ULL, 0x82f7adb4eed85a36ULL, 0x54c4e0486c70241cULL, 0xaf9ed596dde44b39ULL, 0x9219f2cb2079eb59ULL, 0x48e8c05078602818ULL, 0xbf708ae91345fa56ULL, 0x3e39f18d45f6c8b3ULL, 0x3724e9874afacdb0ULL, 0xfc513dd8b4906c24ULL, 0xe07d1dc0a0806020ULL, 0x3932f98b40f2cbb2ULL, 0xd94fe44be072ab92ULL, 0x4e8971ed15b6f8a3ULL, 0x7a134ebae7275dc0ULL, 0xc1d61a85490dcc44ULL, 0x33913751f795a662ULL, 0x70b0806050403010ULL, 0x2b08c99f5eeac1b4ULL, 0xbbc5543fae2a9184ULL, 0xd4e722975211c543ULL, 0xde44ec4de576a893ULL, 0x74055eb6ed2f5bc2ULL, 0xebb46aa17f35de4aULL, 0x145b81a973cedabdULL, 0x8a800c0589068c8fULL, 0xc30275ee99b4772dULL, 0x135089af76cad9bcULL, 0xf32d946fd64ab99cULL, 0x0bc97761dfb5be6aULL, 0xddfa3a9d5d1dc040ULL, 0x577a3698d41b4ccfULL, 0x498279eb10b2fba2ULL, 0xa7e97427ba3a9d80ULL, 0xf09342bf6e21d14fULL, 0x5dd9f842637c211fULL, 0x4c5d1e86c50f43caULL, 0x71da39db3892e3aaULL, 0xd3ec2a915715c642ULL }; static const u64 c[KHAZAD_ROUNDS + 1] = { 0xba542f7453d3d24dULL, 0x50ac8dbf70529a4cULL, 0xead597d133515ba6ULL, 0xde48a899db32b7fcULL, 0xe39e919be2bb416eULL, 0xa5cb6b95a1f3b102ULL, 0xccc41d14c363da5dULL, 0x5fdc7dcd7f5a6c5cULL, 0xf726ffede89d6f8eULL }; static int khazad_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct khazad_ctx *ctx = crypto_tfm_ctx(tfm); int r; const u64 *S = T7; u64 K2, K1; K2 = get_unaligned_be64(&in_key[0]); K1 = get_unaligned_be64(&in_key[8]); /* setup the encrypt key */ for (r = 0; r <= KHAZAD_ROUNDS; r++) { ctx->E[r] = T0[(int)(K1 >> 56) ] ^ T1[(int)(K1 >> 48) & 0xff] ^ T2[(int)(K1 >> 40) & 0xff] ^ T3[(int)(K1 >> 32) & 0xff] ^ T4[(int)(K1 >> 24) & 0xff] ^ T5[(int)(K1 >> 16) & 0xff] ^ T6[(int)(K1 >> 8) & 0xff] ^ T7[(int)(K1 ) & 0xff] ^ c[r] ^ K2; K2 = K1; K1 = ctx->E[r]; } /* Setup the decrypt key */ ctx->D[0] = ctx->E[KHAZAD_ROUNDS]; for (r = 1; r < KHAZAD_ROUNDS; r++) { K1 = ctx->E[KHAZAD_ROUNDS - r]; ctx->D[r] = T0[(int)S[(int)(K1 >> 56) ] & 0xff] ^ T1[(int)S[(int)(K1 >> 48) & 0xff] & 0xff] ^ T2[(int)S[(int)(K1 >> 40) & 0xff] & 0xff] ^ T3[(int)S[(int)(K1 >> 32) & 0xff] & 0xff] ^ T4[(int)S[(int)(K1 >> 24) & 0xff] & 0xff] ^ T5[(int)S[(int)(K1 >> 16) & 0xff] & 0xff] ^ T6[(int)S[(int)(K1 >> 8) & 0xff] & 0xff] ^ T7[(int)S[(int)(K1 ) & 0xff] & 0xff]; } ctx->D[KHAZAD_ROUNDS] = ctx->E[0]; return 0; } static void khazad_crypt(const u64 roundKey[KHAZAD_ROUNDS + 1], u8 *dst, const u8 *src) { int r; u64 state; state = get_unaligned_be64(src) ^ roundKey[0]; for (r = 1; r < KHAZAD_ROUNDS; r++) { state = T0[(int)(state >> 56) ] ^ T1[(int)(state >> 48) & 0xff] ^ T2[(int)(state >> 40) & 0xff] ^ T3[(int)(state >> 32) & 0xff] ^ T4[(int)(state >> 24) & 0xff] ^ T5[(int)(state >> 16) & 0xff] ^ T6[(int)(state >> 8) & 0xff] ^ T7[(int)(state ) & 0xff] ^ roundKey[r]; } state = (T0[(int)(state >> 56) ] & 0xff00000000000000ULL) ^ (T1[(int)(state >> 48) & 0xff] & 0x00ff000000000000ULL) ^ (T2[(int)(state >> 40) & 0xff] & 0x0000ff0000000000ULL) ^ (T3[(int)(state >> 32) & 0xff] & 0x000000ff00000000ULL) ^ (T4[(int)(state >> 24) & 0xff] & 0x00000000ff000000ULL) ^ (T5[(int)(state >> 16) & 0xff] & 0x0000000000ff0000ULL) ^ (T6[(int)(state >> 8) & 0xff] & 0x000000000000ff00ULL) ^ (T7[(int)(state ) & 0xff] & 0x00000000000000ffULL) ^ roundKey[KHAZAD_ROUNDS]; put_unaligned_be64(state, dst); } static void khazad_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct khazad_ctx *ctx = crypto_tfm_ctx(tfm); khazad_crypt(ctx->E, dst, src); } static void khazad_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct khazad_ctx *ctx = crypto_tfm_ctx(tfm); khazad_crypt(ctx->D, dst, src); } static struct crypto_alg khazad_alg = { .cra_name = "khazad", .cra_driver_name = "khazad-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = KHAZAD_BLOCK_SIZE, .cra_ctxsize = sizeof (struct khazad_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = KHAZAD_KEY_SIZE, .cia_max_keysize = KHAZAD_KEY_SIZE, .cia_setkey = khazad_setkey, .cia_encrypt = khazad_encrypt, .cia_decrypt = khazad_decrypt } } }; static int __init khazad_mod_init(void) { int ret = 0; ret = crypto_register_alg(&khazad_alg); return ret; } static void __exit khazad_mod_fini(void) { crypto_unregister_alg(&khazad_alg); } module_init(khazad_mod_init); module_exit(khazad_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Khazad Cryptographic Algorithm"); MODULE_ALIAS_CRYPTO("khazad");
11 1 1 2 3 4 5 5 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * mpls tunnels An implementation mpls tunnels using the light weight tunnel * infrastructure * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/module.h> #include <linux/mpls.h> #include <linux/vmalloc.h> #include <net/ip.h> #include <net/dst.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/netns/generic.h> #include <net/ip6_fib.h> #include <net/route.h> #include <net/mpls_iptunnel.h> #include <linux/mpls_iptunnel.h> #include "internal.h" static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { [MPLS_IPTUNNEL_DST] = { .len = sizeof(u32) }, [MPLS_IPTUNNEL_TTL] = { .type = NLA_U8 }, }; static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) { /* The size of the layer 2.5 labels to be added for this route */ return en->labels * sizeof(struct mpls_shim_hdr); } static int mpls_xmit(struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; struct net_device *out_dev; unsigned int hh_len; unsigned int new_header_size; unsigned int mtu; struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; struct mpls_dev *out_mdev; struct net *net; int err = 0; bool bos; int i; unsigned int ttl; /* Find the output device */ out_dev = dst->dev; net = dev_net(out_dev); if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; skb_forward_csum(skb); tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); /* Obtain the ttl using the following set of rules. * * LWT ttl propagation setting: * - disabled => use default TTL value from LWT * - enabled => use TTL value from IPv4/IPv6 header * - default => * Global ttl propagation setting: * - disabled => use default TTL value from global setting * - enabled => use TTL value from IPv4/IPv6 header */ if (dst->ops->family == AF_INET) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && !net->mpls.ip_ttl_propagate) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && !net->mpls.ip_ttl_propagate) ttl = net->mpls.default_ttl; else ttl = ipv6_hdr(skb)->hop_limit; rt6 = dst_rt6_info(dst); } else { goto drop; } /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) goto drop; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) hh_len = 0; /* Ensure there is enough space for the headers in the skb */ if (skb_cow_head(skb, hh_len + new_header_size)) goto drop; skb_set_inner_protocol(skb, skb->protocol); skb_reset_inner_network_header(skb); skb_push(skb, new_header_size); skb_reset_network_header(skb); skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); /* Push the new labels */ hdr = mpls_hdr(skb); bos = true; for (i = tun_encap_info->labels - 1; i >= 0; i--) { hdr[i] = mpls_entry_encode(tun_encap_info->label[i], ttl, 0, bos); bos = false; } mpls_stats_inc_outucastpkts(out_dev, skb); if (rt) { if (rt->rt_gw_family == AF_INET6) err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, skb); else err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, skb); } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], skb); } else err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, skb); } if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); return LWTUNNEL_XMIT_DONE; drop: out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); return -EINVAL; } static int mpls_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct mpls_iptunnel_encap *tun_encap_info; struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; u8 n_labels; int ret; ret = nla_parse_nested_deprecated(tb, MPLS_IPTUNNEL_MAX, nla, mpls_iptunnel_policy, extack); if (ret < 0) return ret; if (!tb[MPLS_IPTUNNEL_DST]) { NL_SET_ERR_MSG(extack, "MPLS_IPTUNNEL_DST attribute is missing"); return -EINVAL; } /* determine number of labels */ if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, &n_labels, NULL, extack)) return -EINVAL; newts = lwtunnel_state_alloc(struct_size(tun_encap_info, label, n_labels)); if (!newts) return -ENOMEM; tun_encap_info = mpls_lwtunnel_encap(newts); ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels, &tun_encap_info->labels, tun_encap_info->label, extack); if (ret) goto errout; tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT; if (tb[MPLS_IPTUNNEL_TTL]) { tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]); /* TTL 0 implies propagate from IP header */ tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ? MPLS_TTL_PROP_DISABLED : MPLS_TTL_PROP_ENABLED; } newts->type = LWTUNNEL_ENCAP_MPLS; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; newts->headroom = mpls_encap_size(tun_encap_info); *ts = newts; return 0; errout: kfree(newts); *ts = NULL; return ret; } static int mpls_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; tun_encap_info = mpls_lwtunnel_encap(lwtstate); if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, tun_encap_info->label)) goto nla_put_failure; if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT && nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; int nlsize; tun_encap_info = mpls_lwtunnel_encap(lwtstate); nlsize = nla_total_size(tun_encap_info->labels * 4); if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT) nlsize += nla_total_size(1); return nlsize; } static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); int l; if (a_hdr->labels != b_hdr->labels || a_hdr->ttl_propagate != b_hdr->ttl_propagate || a_hdr->default_ttl != b_hdr->default_ttl) return 1; for (l = 0; l < a_hdr->labels; l++) if (a_hdr->label[l] != b_hdr->label[l]) return 1; return 0; } static const struct lwtunnel_encap_ops mpls_iptun_ops = { .build_state = mpls_build_state, .xmit = mpls_xmit, .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, .owner = THIS_MODULE, }; static int __init mpls_iptunnel_init(void) { return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); } module_init(mpls_iptunnel_init); static void __exit mpls_iptunnel_exit(void) { lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); } module_exit(mpls_iptunnel_exit); MODULE_ALIAS_RTNL_LWT(MPLS); MODULE_SOFTDEP("post: mpls_gso"); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2");
4 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0-or-later /* * scsi_netlink.c - SCSI Transport Netlink Interface * * Copyright (C) 2006 James Smart, Emulex Corporation */ #include <linux/time.h> #include <linux/jiffies.h> #include <linux/security.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/export.h> #include <net/sock.h> #include <net/netlink.h> #include <scsi/scsi_netlink.h> #include "scsi_priv.h" struct sock *scsi_nl_sock = NULL; EXPORT_SYMBOL_GPL(scsi_nl_sock); /** * scsi_nl_rcv_msg - Receive message handler. * @skb: socket receive buffer * * Description: Extracts message from a receive buffer. * Validates message header and calls appropriate transport message handler * * **/ static void scsi_nl_rcv_msg(struct sk_buff *skb) { struct nlmsghdr *nlh; struct scsi_nl_hdr *hdr; u32 rlen; int err, tport; while (skb->len >= NLMSG_HDRLEN) { err = 0; nlh = nlmsg_hdr(skb); if ((nlh->nlmsg_len < (sizeof(*nlh) + sizeof(*hdr))) || (skb->len < nlh->nlmsg_len)) { printk(KERN_WARNING "%s: discarding partial skb\n", __func__); return; } rlen = NLMSG_ALIGN(nlh->nlmsg_len); if (rlen > skb->len) rlen = skb->len; if (nlh->nlmsg_type != SCSI_TRANSPORT_MSG) { err = -EBADMSG; goto next_msg; } hdr = nlmsg_data(nlh); if ((hdr->version != SCSI_NL_VERSION) || (hdr->magic != SCSI_NL_MAGIC)) { err = -EPROTOTYPE; goto next_msg; } if (!netlink_capable(skb, CAP_SYS_ADMIN)) { err = -EPERM; goto next_msg; } if (nlh->nlmsg_len < (sizeof(*nlh) + hdr->msglen)) { printk(KERN_WARNING "%s: discarding partial message\n", __func__); goto next_msg; } /* * Deliver message to the appropriate transport */ tport = hdr->transport; if (tport == SCSI_NL_TRANSPORT) { switch (hdr->msgtype) { case SCSI_NL_SHOST_VENDOR: /* Locate the driver that corresponds to the message */ err = -ESRCH; break; default: err = -EBADR; break; } if (err) printk(KERN_WARNING "%s: Msgtype %d failed - err %d\n", __func__, hdr->msgtype, err); } else err = -ENOENT; next_msg: if ((err) || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err, NULL); skb_pull(skb, rlen); } } /** * scsi_netlink_init - Called by SCSI subsystem to initialize * the SCSI transport netlink interface * **/ void scsi_netlink_init(void) { struct netlink_kernel_cfg cfg = { .input = scsi_nl_rcv_msg, .groups = SCSI_NL_GRP_CNT, }; scsi_nl_sock = netlink_kernel_create(&init_net, NETLINK_SCSITRANSPORT, &cfg); if (!scsi_nl_sock) { printk(KERN_ERR "%s: register of receive handler failed\n", __func__); return; } return; } /** * scsi_netlink_exit - Called by SCSI subsystem to disable the SCSI transport netlink interface * **/ void scsi_netlink_exit(void) { if (scsi_nl_sock) { netlink_kernel_release(scsi_nl_sock); } return; }
182 179 3 15 169 4 19 412 411 412 47 365 19 19 1 1 76 76 76 76 76 75 34 34 34 34 34 34 32 32 32 66 41 2 39 13 10 1 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/kernel.h> #include <net/dst_metadata.h> #include <net/flow.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/inet_dscp.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { int err; struct socket *sock = NULL; struct sockaddr_in udp_addr; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; } sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL(udp_sock_create4); static bool sk_saddr_any(struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #else return !sk->sk_rcv_saddr; #endif } void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { struct sock *sk = sock->sk; /* Disable multicast loopback */ inet_clear_bit(MC_LOOP, sk); /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ inet_inc_convert_csum(sk); rcu_assign_sk_user_data(sk, cfg->sk_user_data); udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); udp_tunnel_update_gro_rcv(sk, true); if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && sk->sk_kern_sock) udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_add_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_del_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ASSERT_RTNL(); ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; for_each_netdev(net, dev) { udp_tunnel_nic_lock(dev); udp_tunnel_nic_add_port(dev, &ti); udp_tunnel_nic_unlock(dev); } } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); /* Notify netdevs that UDP port is no more listening */ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ASSERT_RTNL(); ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; for_each_netdev(net, dev) { udp_tunnel_nic_lock(dev); udp_tunnel_nic_del_port(dev, &ti); udp_tunnel_nic_unlock(dev); } } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck, u16 ipcb_flags) { struct udphdr *uh; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, ipcb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, const unsigned long *flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; if (family == AF_INET) tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); else tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, struct net_device *dev, struct net *net, int oif, __be32 *saddr, const struct ip_tunnel_key *key, __be16 sport, __be16 dport, u8 tos, struct dst_cache *dst_cache) { struct rtable *rt = NULL; struct flowi4 fl4; #ifdef CONFIG_DST_CACHE if (dst_cache) { rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) return rt; } #endif memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; fl4.flowi4_oif = oif; fl4.daddr = key->u.ipv4.dst; fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); return ERR_PTR(-ENETUNREACH); } if (rt->dst.dev == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); ip_rt_put(rt); return ERR_PTR(-ELOOP); } #ifdef CONFIG_DST_CACHE if (dst_cache) dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); #endif *saddr = fl4.saddr; return rt; } EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup); MODULE_DESCRIPTION("IPv4 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL");
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM scsi #if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCSI_H #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> #include <linux/tracepoint.h> #include <linux/trace_seq.h> #define scsi_opcode_name(opcode) { opcode, #opcode } #define show_opcode_name(val) \ __print_symbolic(val, \ scsi_opcode_name(TEST_UNIT_READY), \ scsi_opcode_name(REZERO_UNIT), \ scsi_opcode_name(REQUEST_SENSE), \ scsi_opcode_name(FORMAT_UNIT), \ scsi_opcode_name(READ_BLOCK_LIMITS), \ scsi_opcode_name(REASSIGN_BLOCKS), \ scsi_opcode_name(INITIALIZE_ELEMENT_STATUS), \ scsi_opcode_name(READ_6), \ scsi_opcode_name(WRITE_6), \ scsi_opcode_name(SEEK_6), \ scsi_opcode_name(READ_REVERSE), \ scsi_opcode_name(WRITE_FILEMARKS), \ scsi_opcode_name(SPACE), \ scsi_opcode_name(INQUIRY), \ scsi_opcode_name(RECOVER_BUFFERED_DATA), \ scsi_opcode_name(MODE_SELECT), \ scsi_opcode_name(RESERVE_6), \ scsi_opcode_name(RELEASE_6), \ scsi_opcode_name(COPY), \ scsi_opcode_name(ERASE), \ scsi_opcode_name(MODE_SENSE), \ scsi_opcode_name(START_STOP), \ scsi_opcode_name(RECEIVE_DIAGNOSTIC), \ scsi_opcode_name(SEND_DIAGNOSTIC), \ scsi_opcode_name(ALLOW_MEDIUM_REMOVAL), \ scsi_opcode_name(SET_WINDOW), \ scsi_opcode_name(READ_CAPACITY), \ scsi_opcode_name(READ_10), \ scsi_opcode_name(WRITE_10), \ scsi_opcode_name(SEEK_10), \ scsi_opcode_name(POSITION_TO_ELEMENT), \ scsi_opcode_name(WRITE_VERIFY), \ scsi_opcode_name(VERIFY), \ scsi_opcode_name(SEARCH_HIGH), \ scsi_opcode_name(SEARCH_EQUAL), \ scsi_opcode_name(SEARCH_LOW), \ scsi_opcode_name(SET_LIMITS), \ scsi_opcode_name(PRE_FETCH), \ scsi_opcode_name(READ_POSITION), \ scsi_opcode_name(SYNCHRONIZE_CACHE), \ scsi_opcode_name(LOCK_UNLOCK_CACHE), \ scsi_opcode_name(READ_DEFECT_DATA), \ scsi_opcode_name(MEDIUM_SCAN), \ scsi_opcode_name(COMPARE), \ scsi_opcode_name(COPY_VERIFY), \ scsi_opcode_name(WRITE_BUFFER), \ scsi_opcode_name(READ_BUFFER), \ scsi_opcode_name(UPDATE_BLOCK), \ scsi_opcode_name(READ_LONG), \ scsi_opcode_name(WRITE_LONG), \ scsi_opcode_name(CHANGE_DEFINITION), \ scsi_opcode_name(WRITE_SAME), \ scsi_opcode_name(UNMAP), \ scsi_opcode_name(READ_TOC), \ scsi_opcode_name(LOG_SELECT), \ scsi_opcode_name(LOG_SENSE), \ scsi_opcode_name(XDWRITEREAD_10), \ scsi_opcode_name(MODE_SELECT_10), \ scsi_opcode_name(RESERVE_10), \ scsi_opcode_name(RELEASE_10), \ scsi_opcode_name(MODE_SENSE_10), \ scsi_opcode_name(PERSISTENT_RESERVE_IN), \ scsi_opcode_name(PERSISTENT_RESERVE_OUT), \ scsi_opcode_name(VARIABLE_LENGTH_CMD), \ scsi_opcode_name(REPORT_LUNS), \ scsi_opcode_name(MAINTENANCE_IN), \ scsi_opcode_name(MAINTENANCE_OUT), \ scsi_opcode_name(MOVE_MEDIUM), \ scsi_opcode_name(EXCHANGE_MEDIUM), \ scsi_opcode_name(READ_12), \ scsi_opcode_name(WRITE_12), \ scsi_opcode_name(WRITE_VERIFY_12), \ scsi_opcode_name(SEARCH_HIGH_12), \ scsi_opcode_name(SEARCH_EQUAL_12), \ scsi_opcode_name(SEARCH_LOW_12), \ scsi_opcode_name(READ_ELEMENT_STATUS), \ scsi_opcode_name(SEND_VOLUME_TAG), \ scsi_opcode_name(WRITE_LONG_2), \ scsi_opcode_name(READ_16), \ scsi_opcode_name(WRITE_16), \ scsi_opcode_name(VERIFY_16), \ scsi_opcode_name(WRITE_SAME_16), \ scsi_opcode_name(ZBC_OUT), \ scsi_opcode_name(ZBC_IN), \ scsi_opcode_name(SERVICE_ACTION_IN_16), \ scsi_opcode_name(READ_32), \ scsi_opcode_name(WRITE_32), \ scsi_opcode_name(WRITE_SAME_32), \ scsi_opcode_name(ATA_16), \ scsi_opcode_name(WRITE_ATOMIC_16), \ scsi_opcode_name(ATA_12)) #define scsi_hostbyte_name(result) { result, #result } #define show_hostbyte_name(val) \ __print_symbolic(val, \ scsi_hostbyte_name(DID_OK), \ scsi_hostbyte_name(DID_NO_CONNECT), \ scsi_hostbyte_name(DID_BUS_BUSY), \ scsi_hostbyte_name(DID_TIME_OUT), \ scsi_hostbyte_name(DID_BAD_TARGET), \ scsi_hostbyte_name(DID_ABORT), \ scsi_hostbyte_name(DID_PARITY), \ scsi_hostbyte_name(DID_ERROR), \ scsi_hostbyte_name(DID_RESET), \ scsi_hostbyte_name(DID_BAD_INTR), \ scsi_hostbyte_name(DID_PASSTHROUGH), \ scsi_hostbyte_name(DID_SOFT_ERROR), \ scsi_hostbyte_name(DID_IMM_RETRY), \ scsi_hostbyte_name(DID_REQUEUE), \ scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED), \ scsi_hostbyte_name(DID_TRANSPORT_FAILFAST)) #define scsi_statusbyte_name(result) { result, #result } #define show_statusbyte_name(val) \ __print_symbolic(val, \ scsi_statusbyte_name(SAM_STAT_GOOD), \ scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION), \ scsi_statusbyte_name(SAM_STAT_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_BUSY), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT), \ scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED), \ scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL), \ scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE), \ scsi_statusbyte_name(SAM_STAT_TASK_ABORTED)) #define scsi_prot_op_name(result) { result, #result } #define show_prot_op_name(val) \ __print_symbolic(val, \ scsi_prot_op_name(SCSI_PROT_NORMAL), \ scsi_prot_op_name(SCSI_PROT_READ_INSERT), \ scsi_prot_op_name(SCSI_PROT_WRITE_STRIP), \ scsi_prot_op_name(SCSI_PROT_READ_STRIP), \ scsi_prot_op_name(SCSI_PROT_WRITE_INSERT), \ scsi_prot_op_name(SCSI_PROT_READ_PASS), \ scsi_prot_op_name(SCSI_PROT_WRITE_PASS)) const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int); #define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len) TRACE_EVENT(scsi_dispatch_cmd_start, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len)) ); #define scsi_rtn_name(result) { result, #result } #define show_rtn_name(val) \ __print_symbolic(val, \ scsi_rtn_name(SCSI_MLQUEUE_HOST_BUSY), \ scsi_rtn_name(SCSI_MLQUEUE_DEVICE_BUSY), \ scsi_rtn_name(SCSI_MLQUEUE_EH_RETRY), \ scsi_rtn_name(SCSI_MLQUEUE_TARGET_BUSY)) TRACE_EVENT(scsi_dispatch_cmd_error, TP_PROTO(struct scsi_cmnd *cmd, int rtn), TP_ARGS(cmd, rtn), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, rtn ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->rtn = rtn; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)" \ " rtn=%s", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), show_rtn_name(__entry->rtn) ) ); DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, result ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) __field( u8, sense_key ) __field( u8, asc ) __field( u8, ascq ) ), TP_fast_assign( struct scsi_sense_hdr sshdr; __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->result = cmd->result; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); if (cmd->sense_buffer && SCSI_SENSE_VALID(cmd) && scsi_command_normalize_sense(cmd, &sshdr)) { __entry->sense_key = sshdr.sense_key; __entry->asc = sshdr.asc; __entry->ascq = sshdr.ascq; } else { __entry->sense_key = 0; __entry->asc = 0; __entry->ascq = 0; } ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u " \ "prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s) " \ "result=(driver=%s host=%s message=%s status=%s) " "sense=(key=%#x asc=%#x ascq=%#x)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), "DRIVER_OK", show_hostbyte_name(((__entry->result) >> 16) & 0xff), "COMMAND_COMPLETE", show_statusbyte_name(__entry->result & 0xff), __entry->sense_key, __entry->asc, __entry->ascq) ); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); TRACE_EVENT(scsi_eh_wakeup, TP_PROTO(struct Scsi_Host *shost), TP_ARGS(shost), TP_STRUCT__entry( __field( unsigned int, host_no ) ), TP_fast_assign( __entry->host_no = shost->host_no; ), TP_printk("host_no=%u", __entry->host_no) ); #endif /* _TRACE_SCSI_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 // SPDX-License-Identifier: GPL-2.0-or-later /* Validate the trust chain of a PKCS#7 message. * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKCS7: "fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/asn1.h> #include <linux/key.h> #include <keys/asymmetric-type.h> #include <crypto/public_key.h> #include "pkcs7_parser.h" /* * Check the trust on one PKCS#7 SignedInfo block. */ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo, struct key *trust_keyring) { struct public_key_signature *sig = sinfo->sig; struct x509_certificate *x509, *last = NULL, *p; struct key *key; int ret; kenter(",%u,", sinfo->index); if (sinfo->unsupported_crypto) { kleave(" = -ENOPKG [cached]"); return -ENOPKG; } for (x509 = sinfo->signer; x509; x509 = x509->signer) { if (x509->seen) { if (x509->verified) goto verified; kleave(" = -ENOKEY [cached]"); return -ENOKEY; } x509->seen = true; /* Look to see if this certificate is present in the trusted * keys. */ key = find_asymmetric_key(trust_keyring, x509->id, x509->skid, NULL, false); if (!IS_ERR(key)) { /* One of the X.509 certificates in the PKCS#7 message * is apparently the same as one we already trust. * Verify that the trusted variant can also validate * the signature on the descendant. */ pr_devel("sinfo %u: Cert %u as key %x\n", sinfo->index, x509->index, key_serial(key)); goto matched; } if (key == ERR_PTR(-ENOMEM)) return -ENOMEM; /* Self-signed certificates form roots of their own, and if we * don't know them, then we can't accept them. */ if (x509->signer == x509) { kleave(" = -ENOKEY [unknown self-signed]"); return -ENOKEY; } might_sleep(); last = x509; sig = last->sig; } /* No match - see if the root certificate has a signer amongst the * trusted keys. */ if (last && (last->sig->auth_ids[0] || last->sig->auth_ids[1])) { key = find_asymmetric_key(trust_keyring, last->sig->auth_ids[0], last->sig->auth_ids[1], NULL, false); if (!IS_ERR(key)) { x509 = last; pr_devel("sinfo %u: Root cert %u signer is key %x\n", sinfo->index, x509->index, key_serial(key)); goto matched; } if (PTR_ERR(key) != -ENOKEY) return PTR_ERR(key); } /* As a last resort, see if we have a trusted public key that matches * the signed info directly. */ key = find_asymmetric_key(trust_keyring, sinfo->sig->auth_ids[0], NULL, NULL, false); if (!IS_ERR(key)) { pr_devel("sinfo %u: Direct signer is key %x\n", sinfo->index, key_serial(key)); x509 = NULL; sig = sinfo->sig; goto matched; } if (PTR_ERR(key) != -ENOKEY) return PTR_ERR(key); kleave(" = -ENOKEY [no backref]"); return -ENOKEY; matched: ret = verify_signature(key, sig); key_put(key); if (ret < 0) { if (ret == -ENOMEM) return ret; kleave(" = -EKEYREJECTED [verify %d]", ret); return -EKEYREJECTED; } verified: if (x509) { x509->verified = true; for (p = sinfo->signer; p != x509; p = p->signer) p->verified = true; } kleave(" = 0"); return 0; } /** * pkcs7_validate_trust - Validate PKCS#7 trust chain * @pkcs7: The PKCS#7 certificate to validate * @trust_keyring: Signing certificates to use as starting points * * Validate that the certificate chain inside the PKCS#7 message intersects * keys we already know and trust. * * Returns, in order of descending priority: * * (*) -EKEYREJECTED if a signature failed to match for which we have a valid * key, or: * * (*) 0 if at least one signature chain intersects with the keys in the trust * keyring, or: * * (*) -ENOPKG if a suitable crypto module couldn't be found for a check on a * chain. * * (*) -ENOKEY if we couldn't find a match for any of the signature chains in * the message. * * May also return -ENOMEM. */ int pkcs7_validate_trust(struct pkcs7_message *pkcs7, struct key *trust_keyring) { struct pkcs7_signed_info *sinfo; struct x509_certificate *p; int cached_ret = -ENOKEY; int ret; for (p = pkcs7->certs; p; p = p->next) p->seen = false; for (sinfo = pkcs7->signed_infos; sinfo; sinfo = sinfo->next) { ret = pkcs7_validate_trust_one(pkcs7, sinfo, trust_keyring); switch (ret) { case -ENOKEY: continue; case -ENOPKG: if (cached_ret == -ENOKEY) cached_ret = -ENOPKG; continue; case 0: cached_ret = 0; continue; default: return ret; } } return cached_ret; } EXPORT_SYMBOL_GPL(pkcs7_validate_trust);
22 10 16 58 4 1 3 53 52 12 1 9 2 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over X/Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.5.2 * * Author: Michal Ostrowski <mostrows@speakeasy.net> * * 051000 : Initialization cleanup * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/init.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/kmod.h> #include <net/sock.h> #include <linux/uaccess.h> static const struct pppox_proto *pppox_protos[PX_MAX_PROTO + 1]; int register_pppox_proto(int proto_num, const struct pppox_proto *pp) { if (proto_num < 0 || proto_num > PX_MAX_PROTO) return -EINVAL; if (pppox_protos[proto_num]) return -EALREADY; pppox_protos[proto_num] = pp; return 0; } void unregister_pppox_proto(int proto_num) { if (proto_num >= 0 && proto_num <= PX_MAX_PROTO) pppox_protos[proto_num] = NULL; } void pppox_unbind_sock(struct sock *sk) { /* Clear connection to ppp device, if attached. */ if (sk->sk_state & (PPPOX_BOUND | PPPOX_CONNECTED)) { ppp_unregister_channel(&pppox_sk(sk)->chan); sk->sk_state = PPPOX_DEAD; } } EXPORT_SYMBOL(register_pppox_proto); EXPORT_SYMBOL(unregister_pppox_proto); EXPORT_SYMBOL(pppox_unbind_sock); int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int rc; lock_sock(sk); switch (cmd) { case PPPIOCGCHAN: { int index; rc = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; rc = -EINVAL; index = ppp_channel_index(&po->chan); if (put_user(index , (int __user *) arg)) break; rc = 0; sk->sk_state |= PPPOX_BOUND; break; } default: rc = pppox_protos[sk->sk_protocol]->ioctl ? pppox_protos[sk->sk_protocol]->ioctl(sock, cmd, arg) : -ENOTTY; } release_sock(sk); return rc; } EXPORT_SYMBOL(pppox_ioctl); #ifdef CONFIG_COMPAT int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { if (cmd == PPPOEIOCSFWD32) cmd = PPPOEIOCSFWD; return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(pppox_compat_ioctl); #endif static int pppox_create(struct net *net, struct socket *sock, int protocol, int kern) { int rc = -EPROTOTYPE; if (protocol < 0 || protocol > PX_MAX_PROTO) goto out; rc = -EPROTONOSUPPORT; if (!pppox_protos[protocol]) request_module("net-pf-%d-proto-%d", PF_PPPOX, protocol); if (!pppox_protos[protocol] || !try_module_get(pppox_protos[protocol]->owner)) goto out; rc = pppox_protos[protocol]->create(net, sock, kern); module_put(pppox_protos[protocol]->owner); out: return rc; } static const struct net_proto_family pppox_proto_family = { .family = PF_PPPOX, .create = pppox_create, .owner = THIS_MODULE, }; static int __init pppox_init(void) { return sock_register(&pppox_proto_family); } static void __exit pppox_exit(void) { sock_unregister(PF_PPPOX); } module_init(pppox_init); module_exit(pppox_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver (generic socket layer)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PPPOX);
12 12 12 12 21 13 13 13 13 13 13 13 13 13 13 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/gso.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> #include "tun_vnet.h" #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) static struct proto tap_proto = { .name = "tap", .owner = THIS_MODULE, .obj_size = sizeof(struct tap_queue), }; #define TAP_NUM_DEVS (1U << MINORBITS) static LIST_HEAD(major_list); struct major_info { struct rcu_head rcu; dev_t major; struct idr minor_idr; spinlock_t minor_lock; const char *device_name; struct list_head next; }; #define GOODCOPY_LEN 128 static const struct proto_ops tap_socket_ops; #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST) static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } /* * RCU usage: * The tap_queue and the macvlan_dev are loosely coupled, the * pointers from one to the other can only be read while rcu_read_lock * or rtnl is held. * * Both the file and the macvlan_dev hold a reference on the tap_queue * through sock_hold(&q->sk). When the macvlan_dev goes away first, * q->vlan becomes inaccessible. When the files gets closed, * tap_get_queue() fails. * * There may still be references to the struct sock inside of the * queue from outbound SKBs, but these never reference back to the * file or the dev. The data structure is freed through __sk_free * when both our references and any pending SKBs are gone. */ static int tap_enable_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { int err = -EINVAL; ASSERT_RTNL(); if (q->enabled) goto out; err = 0; rcu_assign_pointer(tap->taps[tap->numvtaps], q); q->queue_index = tap->numvtaps; q->enabled = true; tap->numvtaps++; out: return err; } /* Requires RTNL */ static int tap_set_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { if (tap->numqueues == MAX_TAP_QUEUES) return -EBUSY; rcu_assign_pointer(q->tap, tap); rcu_assign_pointer(tap->taps[tap->numvtaps], q); sock_hold(&q->sk); q->file = file; q->queue_index = tap->numvtaps; q->enabled = true; file->private_data = q; list_add_tail(&q->next, &tap->queue_list); tap->numvtaps++; tap->numqueues++; return 0; } static int tap_disable_queue(struct tap_queue *q) { struct tap_dev *tap; struct tap_queue *nq; ASSERT_RTNL(); if (!q->enabled) return -EINVAL; tap = rtnl_dereference(q->tap); if (tap) { int index = q->queue_index; BUG_ON(index >= tap->numvtaps); nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]); nq->queue_index = index; rcu_assign_pointer(tap->taps[index], nq); RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL); q->enabled = false; tap->numvtaps--; } return 0; } /* * The file owning the queue got closed, give up both * the reference that the files holds as well as the * one from the macvlan_dev if that still exists. * * Using the spinlock makes sure that we don't get * to the queue again after destroying it. */ static void tap_put_queue(struct tap_queue *q) { struct tap_dev *tap; rtnl_lock(); tap = rtnl_dereference(q->tap); if (tap) { if (q->enabled) BUG_ON(tap_disable_queue(q)); tap->numqueues--; RCU_INIT_POINTER(q->tap, NULL); sock_put(&q->sk); list_del_init(&q->next); } rtnl_unlock(); synchronize_rcu(); sock_put(&q->sk); } /* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash * to select a queue. If all fails, find the first available queue. * Cache vlan->numvtaps since it can become zero during the execution * of this function. */ static struct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb) { struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint * and validate that the result isn't NULL - in case we are * racing against queue removal. */ int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) goto out; if (numvtaps == 1) goto single; /* Check if we can use flow to select a queue */ rxq = skb_get_hash(skb); if (rxq) { queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out; } if (likely(skb_rx_queue_recorded(skb))) { rxq = skb_get_rx_queue(skb); while (unlikely(rxq >= numvtaps)) rxq -= numvtaps; queue = rcu_dereference(tap->taps[rxq]); goto out; } single: queue = rcu_dereference(tap->taps[0]); out: return queue; } /* * The net_device is going away, give up the reference * that it holds on all queues and safely set the pointer * from the queues to NULL. */ void tap_del_queues(struct tap_dev *tap) { struct tap_queue *q, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(q, tmp, &tap->queue_list, next) { list_del_init(&q->next); RCU_INIT_POINTER(q->tap, NULL); if (q->enabled) tap->numvtaps--; tap->numqueues--; sock_put(&q->sk); } BUG_ON(tap->numvtaps); BUG_ON(tap->numqueues); /* guarantee that any future tap_set_queue will fail */ tap->numvtaps = MAX_TAP_QUEUES; } EXPORT_SYMBOL_GPL(tap_del_queues); rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; enum skb_drop_reason drop_reason; tap = tap_dev_get_rcu(dev); if (!tap) return RX_HANDLER_PASS; q = tap_get_queue(tap, skb); if (!q) return RX_HANDLER_PASS; skb_push(skb, ETH_HLEN); /* Apply the forward feature mask so that we perform segmentation * according to users wishes. This only works if VNET_HDR is * enabled. */ if (q->flags & IFF_VNET_HDR) features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; if (IS_ERR(segs)) { drop_reason = SKB_DROP_REASON_SKB_GSO_SEG; goto drop; } if (!segs) { if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } goto wake_up; } consume_skb(skb); skb_list_walk_safe(segs, skb, next) { skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; kfree_skb_reason(skb, drop_reason); kfree_skb_list_reason(next, drop_reason); break; } } } else { /* If we receive a partial checksum and the tap side * doesn't support checksum offload, compute the checksum. * Note: it doesn't matter which checksum feature to * check, we either support them all or none. */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) { drop_reason = SKB_DROP_REASON_SKB_CSUM; goto drop; } if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } } wake_up: wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); return RX_HANDLER_CONSUMED; drop: /* Count errors/drops only here, thus don't care about args. */ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); kfree_skb_reason(skb, drop_reason); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); static struct major_info *tap_get_major(int major) { struct major_info *tap_major; list_for_each_entry_rcu(tap_major, &major_list, next) { if (tap_major->major == major) return tap_major; } return NULL; } int tap_get_minor(dev_t major, struct tap_dev *tap) { int retval = -ENOMEM; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { retval = -EINVAL; goto unlock; } spin_lock(&tap_major->minor_lock); retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC); if (retval >= 0) { tap->minor = retval; } else if (retval == -ENOSPC) { netdev_err(tap->dev, "Too many tap devices\n"); retval = -EINVAL; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return retval < 0 ? retval : 0; } EXPORT_SYMBOL_GPL(tap_get_minor); void tap_free_minor(dev_t major, struct tap_dev *tap) { struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { goto unlock; } spin_lock(&tap_major->minor_lock); if (tap->minor) { idr_remove(&tap_major->minor_idr, tap->minor); tap->minor = 0; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(tap_free_minor); static struct tap_dev *dev_get_by_tap_file(int major, int minor) { struct net_device *dev = NULL; struct tap_dev *tap; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(major); if (!tap_major) { tap = NULL; goto unlock; } spin_lock(&tap_major->minor_lock); tap = idr_find(&tap_major->minor_idr, minor); if (tap) { dev = tap->dev; dev_hold(dev); } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return tap; } static void tap_sock_write_space(struct sock *sk) { wait_queue_head_t *wqueue; if (!sock_writeable(sk) || !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } static void tap_sock_destruct(struct sock *sk) { struct tap_queue *q = container_of(sk, struct tap_queue, sk); ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb); } static int tap_open(struct inode *inode, struct file *file) { struct net *net = current->nsproxy->net_ns; struct tap_dev *tap; struct tap_queue *q; int err = -ENODEV; rtnl_lock(); tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); if (!tap) goto err; err = -ENOMEM; q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tap_proto, 0); if (!q) goto err; if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) { sk_free(&q->sk); goto err; } init_waitqueue_head(&q->sock.wq.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; q->sock.file = file; q->sock.ops = &tap_socket_ops; sock_init_data_uid(&q->sock, &q->sk, current_fsuid()); q->sk.sk_write_space = tap_sock_write_space; q->sk.sk_destruct = tap_sock_destruct; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); /* * so far only KVM virtio_net uses tap, enable zero copy between * guest kernel and host kernel when lower device supports zerocopy * * The macvlan supports zerocopy iff the lower device supports zero * copy so we don't have to look at the lower device directly. */ if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) sock_set_flag(&q->sk, SOCK_ZEROCOPY); err = tap_set_queue(tap, file, q); if (err) { /* tap_sock_destruct() will take care of freeing ptr_ring */ goto err_put; } /* tap groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; dev_put(tap->dev); rtnl_unlock(); return err; err_put: sock_put(&q->sk); err: if (tap) dev_put(tap->dev); rtnl_unlock(); return err; } static int tap_release(struct inode *inode, struct file *file) { struct tap_queue *q = file->private_data; tap_put_queue(q); return 0; } static __poll_t tap_poll(struct file *file, poll_table *wait) { struct tap_queue *q = file->private_data; __poll_t mask = EPOLLERR; if (!q) goto out; mask = 0; poll_wait(file, &q->sock.wq.wait, wait); if (!ptr_ring_empty(&q->ring)) mask |= EPOLLIN | EPOLLRDNORM; if (sock_writeable(&q->sk) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= EPOLLOUT | EPOLLWRNORM; out: return mask; } static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } /* Neighbour code has some assumptions on HH_DATA_MOD alignment */ #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) /* Get packet from user space buffer */ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct iov_iter *from, int noblock) { int good_linear = SKB_MAX_HEAD(TAP_RESERVE); struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from); unsigned long len = total_len; int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; int hdr_len = 0; int copylen = 0; int depth; bool zerocopy = false; size_t linear; enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); hdr_len = tun_vnet_hdr_get(vnet_hdr_len, q->flags, from, &vnet_hdr); if (hdr_len < 0) { err = hdr_len; goto err; } len -= vnet_hdr_len; } err = -EINVAL; if (unlikely(len < ETH_HLEN)) goto err; if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; copylen = clamp(hdr_len ?: GOODCOPY_LEN, ETH_HLEN, good_linear); linear = copylen; i = *from; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; linear = clamp(hdr_len, ETH_HLEN, good_linear); } skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen, linear, noblock, &err); if (!skb) goto err; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto err_kfree; } skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; rcu_read_lock(); tap = rcu_dereference(q->tap); if (!tap) { kfree_skb(skb); rcu_read_unlock(); return total_len; } skb->dev = tap->dev; if (vnet_hdr_len) { err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr); if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR; goto err_kfree; } } skb_probe_transport_header(skb); /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } dev_queue_xmit(skb); rcu_read_unlock(); return total_len; err_kfree: kfree_skb_reason(skb, drop_reason); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; return tap_get_user(q, NULL, from, noblock); } /* Put packet to the user space buffer */ static ssize_t tap_put_user(struct tap_queue *q, const struct sk_buff *skb, struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); if (ret) return ret; ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr); if (ret) return ret; } total = vnet_hdr_len; total += skb->len; if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } ret = skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: return ret ? ret : total; } static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, int noblock, struct sk_buff *skb) { DEFINE_WAIT(wait); ssize_t ret = 0; if (!iov_iter_count(to)) { kfree_skb(skb); return 0; } if (skb) goto put; while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE); /* Read frames from the queue */ skb = ptr_ring_consume(&q->ring); if (skb) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); } if (!noblock) finish_wait(sk_sleep(&q->sk), &wait); put: if (skb) { ret = tap_put_user(q, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; ssize_t len = iov_iter_count(to), ret; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tap_do_read(q, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; return ret; } static struct tap_dev *tap_get_tap_dev(struct tap_queue *q) { struct tap_dev *tap; ASSERT_RTNL(); tap = rtnl_dereference(q->tap); if (tap) dev_hold(tap->dev); return tap; } static void tap_put_tap_dev(struct tap_dev *tap) { dev_put(tap->dev); } static int tap_ioctl_set_queue(struct file *file, unsigned int flags) { struct tap_queue *q = file->private_data; struct tap_dev *tap; int ret; tap = tap_get_tap_dev(q); if (!tap) return -EINVAL; if (flags & IFF_ATTACH_QUEUE) ret = tap_enable_queue(tap, file, q); else if (flags & IFF_DETACH_QUEUE) ret = tap_disable_queue(q); else ret = -EINVAL; tap_put_tap_dev(tap); return ret; } static int set_offload(struct tap_queue *q, unsigned long arg) { struct tap_dev *tap; netdev_features_t features; netdev_features_t feature_mask = 0; tap = rtnl_dereference(q->tap); if (!tap) return -ENOLINK; features = tap->dev->features; if (arg & TUN_F_CSUM) { feature_mask = NETIF_F_HW_CSUM; if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) feature_mask |= NETIF_F_TSO_ECN; if (arg & TUN_F_TSO4) feature_mask |= NETIF_F_TSO; if (arg & TUN_F_TSO6) feature_mask |= NETIF_F_TSO6; } /* TODO: for now USO4 and USO6 should work simultaneously */ if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= NETIF_F_GSO_UDP_L4; } /* tun/tap driver inverts the usage for TSO offloads, where * setting the TSO bit means that the userspace wants to * accept TSO frames and turning it off means that user space * does not support TSO. * For tap, we have to invert it to mean the same thing. * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames. */ if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) || (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= RX_OFFLOADS; else features &= ~RX_OFFLOADS; /* tap_features are the same as features on tun/tap and * reflect user expectations. */ tap->tap_features = feature_mask; if (tap->update_features) tap->update_features(tap, features); return 0; } /* * provide compatibility with generic tun/tap interface */ static long tap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tap_queue *q = file->private_data; struct tap_dev *tap; void __user *argp = (void __user *)arg; struct ifreq __user *ifr = argp; unsigned int __user *up = argp; unsigned short u; int __user *sp = argp; struct sockaddr_storage ss; int s; int ret; switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT; ret = 0; if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP)) ret = -EINVAL; else q->flags = (q->flags & ~TAP_IFFEATURES) | u; return ret; case TUNGETIFF: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; u = q->flags; if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || put_user(u, &ifr->ifr_flags)) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT; rtnl_lock(); ret = tap_ioctl_set_queue(file, u); rtnl_unlock(); return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up)) return -EFAULT; return 0; case TUNSETSNDBUF: if (get_user(s, sp)) return -EFAULT; if (s <= 0) return -EINVAL; q->sk.sk_sndbuf = s; return 0; case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO | TUN_F_USO4 | TUN_F_USO6)) return -EINVAL; rtnl_lock(); ret = set_offload(q, arg); rtnl_unlock(); return ret; case SIOCGIFHWADDR: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; netif_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev), tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || copy_to_user(&ifr->ifr_hwaddr, &ss, sizeof(ifr->ifr_hwaddr))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case SIOCSIFHWADDR: if (copy_from_user(&ss, &ifr->ifr_hwaddr, sizeof(ifr->ifr_hwaddr))) return -EFAULT; rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } if (tap->dev->addr_len > sizeof(ifr->ifr_hwaddr)) ret = -EINVAL; else ret = dev_set_mac_address_user(tap->dev, &ss, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; default: return tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, cmd, sp); } } static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, .release = tap_release, .read_iter = tap_read_iter, .write_iter = tap_write_iter, .poll = tap_poll, .unlocked_ioctl = tap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) { struct virtio_net_hdr *gso = xdp->data_hard_start; int buflen = xdp->frame_sz; int vnet_hdr_len = 0; struct tap_dev *tap; struct sk_buff *skb; int err, depth; if (unlikely(xdp->data_end - xdp->data < ETH_HLEN)) { err = -EINVAL; goto err; } if (q->flags & IFF_VNET_HDR) vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { err = -ENOMEM; goto err; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { err = tun_vnet_hdr_to_skb(q->flags, skb, gso); if (err) goto err_kfree; } /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; skb_probe_transport_header(skb); dev_queue_xmit(skb); } else { kfree_skb(skb); } rcu_read_unlock(); return 0; err_kfree: kfree_skb(skb); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; int i; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { for (i = 0; i < ctl->num; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; tap_get_user_xdp(q, xdp); } return 0; } return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); } static int tap_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct sk_buff *skb = m->msg_control; int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { kfree_skb(skb); return -EINVAL; } ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } return ret; } static int tap_peek_len(struct socket *sock) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag); } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tap_socket_ops = { .sendmsg = tap_sendmsg, .recvmsg = tap_recvmsg, .peek_len = tap_peek_len, }; /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tap_get_socket(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->sock; } EXPORT_SYMBOL_GPL(tap_get_socket); struct ptr_ring *tap_get_ptr_ring(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->ring; } EXPORT_SYMBOL_GPL(tap_get_ptr_ring); int tap_queue_resize(struct tap_dev *tap) { struct net_device *dev = tap->dev; struct tap_queue *q; struct ptr_ring **rings; int n = tap->numqueues; int ret, i = 0; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, __skb_array_destroy_skb); kfree(rings); return ret; } EXPORT_SYMBOL_GPL(tap_queue_resize); static int tap_list_add(dev_t major, const char *device_name) { struct major_info *tap_major; tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC); if (!tap_major) return -ENOMEM; tap_major->major = MAJOR(major); idr_init(&tap_major->minor_idr); spin_lock_init(&tap_major->minor_lock); tap_major->device_name = device_name; list_add_tail_rcu(&tap_major->next, &major_list); return 0; } int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, const char *device_name, struct module *module) { int err; err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name); if (err) goto out1; cdev_init(tap_cdev, &tap_fops); tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; err = tap_list_add(*tap_major, device_name); if (err) goto out3; return 0; out3: cdev_del(tap_cdev); out2: unregister_chrdev_region(*tap_major, TAP_NUM_DEVS); out1: return err; } EXPORT_SYMBOL_GPL(tap_create_cdev); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev) { struct major_info *tap_major, *tmp; cdev_del(tap_cdev); unregister_chrdev_region(major, TAP_NUM_DEVS); list_for_each_entry_safe(tap_major, tmp, &major_list, next) { if (tap_major->major == MAJOR(major)) { idr_destroy(&tap_major->minor_idr); list_del_rcu(&tap_major->next); kfree_rcu(tap_major, rcu); } } } EXPORT_SYMBOL_GPL(tap_destroy_cdev); MODULE_DESCRIPTION("Common library for drivers implementing the TAP interface"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
10 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PFXLEN_H #define _PFXLEN_H #include <asm/byteorder.h> #include <linux/netfilter.h> #include <net/tcp.h> /* Prefixlen maps, by Jan Engelhardt */ extern const union nf_inet_addr ip_set_netmask_map[]; extern const union nf_inet_addr ip_set_hostmask_map[]; static inline __be32 ip_set_netmask(u8 pfxlen) { return ip_set_netmask_map[pfxlen].ip; } static inline const __be32 * ip_set_netmask6(u8 pfxlen) { return &ip_set_netmask_map[pfxlen].ip6[0]; } static inline u32 ip_set_hostmask(u8 pfxlen) { return (__force u32) ip_set_hostmask_map[pfxlen].ip; } static inline const __be32 * ip_set_hostmask6(u8 pfxlen) { return &ip_set_hostmask_map[pfxlen].ip6[0]; } extern u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr); #define ip_set_mask_from_to(from, to, cidr) \ do { \ from &= ip_set_hostmask(cidr); \ to = from | ~ip_set_hostmask(cidr); \ } while (0) static inline void ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip->ip6[0] &= ip_set_netmask6(prefix)[0]; ip->ip6[1] &= ip_set_netmask6(prefix)[1]; ip->ip6[2] &= ip_set_netmask6(prefix)[2]; ip->ip6[3] &= ip_set_netmask6(prefix)[3]; } #endif /*_PFXLEN_H */
96 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 // SPDX-License-Identifier: GPL-2.0-or-later /* Sysfs attributes of bond slaves * * Copyright (c) 2014 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/bonding.h> struct slave_attribute { struct attribute attr; ssize_t (*show)(struct slave *, char *); }; #define SLAVE_ATTR_RO(_name) \ const struct slave_attribute slave_attr_##_name = __ATTR_RO(_name) static ssize_t state_show(struct slave *slave, char *buf) { switch (bond_slave_state(slave)) { case BOND_STATE_ACTIVE: return sysfs_emit(buf, "active\n"); case BOND_STATE_BACKUP: return sysfs_emit(buf, "backup\n"); default: return sysfs_emit(buf, "UNKNOWN\n"); } } static SLAVE_ATTR_RO(state); static ssize_t mii_status_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%s\n", bond_slave_link_status(slave->link)); } static SLAVE_ATTR_RO(mii_status); static ssize_t link_failure_count_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", slave->link_failure_count); } static SLAVE_ATTR_RO(link_failure_count); static ssize_t perm_hwaddr_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%*phC\n", slave->dev->addr_len, slave->perm_hwaddr); } static SLAVE_ATTR_RO(perm_hwaddr); static ssize_t queue_id_show(struct slave *slave, char *buf) { return sysfs_emit(buf, "%d\n", READ_ONCE(slave->queue_id)); } static SLAVE_ATTR_RO(queue_id); static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) { const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg) return sysfs_emit(buf, "%d\n", agg->aggregator_identifier); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_aggregator_id); static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_actor_oper_port_state); static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) { const struct port *ad_port; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; if (ad_port->aggregator) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } return sysfs_emit(buf, "N/A\n"); } static SLAVE_ATTR_RO(ad_partner_oper_port_state); static const struct attribute *slave_attrs[] = { &slave_attr_state.attr, &slave_attr_mii_status.attr, &slave_attr_link_failure_count.attr, &slave_attr_perm_hwaddr.attr, &slave_attr_queue_id.attr, &slave_attr_ad_aggregator_id.attr, &slave_attr_ad_actor_oper_port_state.attr, &slave_attr_ad_partner_oper_port_state.attr, NULL }; #define to_slave_attr(_at) container_of(_at, struct slave_attribute, attr) static ssize_t slave_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct slave_attribute *slave_attr = to_slave_attr(attr); struct slave *slave = to_slave(kobj); return slave_attr->show(slave, buf); } const struct sysfs_ops slave_sysfs_ops = { .show = slave_show, }; int bond_sysfs_slave_add(struct slave *slave) { return sysfs_create_files(&slave->kobj, slave_attrs); } void bond_sysfs_slave_del(struct slave *slave) { sysfs_remove_files(&slave->kobj, slave_attrs); }
3 106 409 420 56 5 419 11 48 48 11 48 48 420 7 14 7 7 7 452 444 372 74 444 444 48 11 420 420 45 48 444 14 14 4 3 7 4 1 5 5 2 2 14 442 2 14 14 73 2 72 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers */ #include <linux/module.h> #include <linux/mutex.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/tracepoint.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/static_key.h> enum tp_func_state { TP_FUNC_0, TP_FUNC_1, TP_FUNC_2, TP_FUNC_N, }; extern tracepoint_ptr_t __start___tracepoints_ptrs[]; extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; enum tp_transition_sync { TP_TRANSITION_SYNC_1_0_1, TP_TRANSITION_SYNC_N_2_1, _NR_TP_TRANSITION_SYNC, }; struct tp_transition_snapshot { unsigned long rcu; bool ongoing; }; /* Protected by tracepoints_mutex */ static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC]; static void tp_rcu_get_state(enum tp_transition_sync sync) { struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; /* Keep the latest get_state snapshot. */ snapshot->rcu = get_state_synchronize_rcu(); snapshot->ongoing = true; } static void tp_rcu_cond_sync(enum tp_transition_sync sync) { struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; if (!snapshot->ongoing) return; cond_synchronize_rcu(snapshot->rcu); snapshot->ongoing = false; } /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; #ifdef CONFIG_MODULES /* * Tracepoint module list mutex protects the local module list. */ static DEFINE_MUTEX(tracepoint_module_list_mutex); /* Local list of struct tp_module */ static LIST_HEAD(tracepoint_module_list); #endif /* CONFIG_MODULES */ /* * tracepoints_mutex protects the builtin and module tracepoints. * tracepoints_mutex nests inside tracepoint_module_list_mutex. */ static DEFINE_MUTEX(tracepoints_mutex); /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent * state is reached. */ struct tp_probes { struct rcu_head rcu; struct tracepoint_func probes[]; }; /* Called in removal of a func but failed to allocate a new tp_funcs */ static void tp_stub_func(void) { return; } static inline void *allocate_probes(int count) { struct tp_probes *p = kmalloc(struct_size(p, probes, count), GFP_KERNEL); return p == NULL ? NULL : p->probes; } static void rcu_free_old_probes(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); if (tracepoint_is_faultable(tp)) call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes); else call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } static void debug_print_probes(struct tracepoint_func *funcs) { int i; if (!tracepoint_debug || !funcs) return; for (i = 0; funcs[i].func; i++) printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, int prio) { struct tracepoint_func *old, *new; int iter_probes; /* Iterate over old probe array. */ int nr_probes = 0; /* Counter for probes */ int pos = -1; /* Insertion position into new array */ if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); debug_print_probes(*funcs); old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ for (iter_probes = 0; old[iter_probes].func; iter_probes++) { if (old[iter_probes].func == tp_stub_func) continue; /* Skip stub functions. */ if (old[iter_probes].func == tp_func->func && old[iter_probes].data == tp_func->data) return ERR_PTR(-EEXIST); nr_probes++; } } /* + 2 : one for new probe, one for NULL func */ new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) { nr_probes = 0; for (iter_probes = 0; old[iter_probes].func; iter_probes++) { if (old[iter_probes].func == tp_stub_func) continue; /* Insert before probes of lower priority */ if (pos < 0 && old[iter_probes].prio < prio) pos = nr_probes++; new[nr_probes++] = old[iter_probes]; } if (pos < 0) pos = nr_probes++; /* nr_probes now points to the end of the new array */ } else { pos = 0; nr_probes = 1; /* must point at end of array */ } new[pos] = *tp_func; new[nr_probes].func = NULL; *funcs = new; debug_print_probes(*funcs); return old; } static void *func_remove(struct tracepoint_func **funcs, struct tracepoint_func *tp_func) { int nr_probes = 0, nr_del = 0, i; struct tracepoint_func *old, *new; old = *funcs; if (!old) return ERR_PTR(-ENOENT); debug_print_probes(*funcs); /* (N -> M), (N > 1, M >= 0) probes */ if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { if ((old[nr_probes].func == tp_func->func && old[nr_probes].data == tp_func->data) || old[nr_probes].func == tp_stub_func) nr_del++; } } /* * If probe is NULL, then nr_probes = nr_del = 0, and then the * entire entry will be removed. */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ *funcs = NULL; debug_print_probes(*funcs); return old; } else { int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ new = allocate_probes(nr_probes - nr_del + 1); if (new) { for (i = 0; old[i].func; i++) { if ((old[i].func != tp_func->func || old[i].data != tp_func->data) && old[i].func != tp_stub_func) new[j++] = old[i]; } new[nr_probes - nr_del].func = NULL; *funcs = new; } else { /* * Failed to allocate, replace the old function * with calls to tp_stub_func. */ for (i = 0; old[i].func; i++) { if (old[i].func == tp_func->func && old[i].data == tp_func->data) WRITE_ONCE(old[i].func, tp_stub_func); } *funcs = old; } } debug_print_probes(*funcs); return old; } /* * Count the number of functions (enum tp_func_state) in a tp_funcs array. */ static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs) { if (!tp_funcs) return TP_FUNC_0; if (!tp_funcs[1].func) return TP_FUNC_1; if (!tp_funcs[2].func) return TP_FUNC_2; return TP_FUNC_N; /* 3 or more */ } static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs) { void *func = tp->iterator; /* Synthetic events do not have static call sites */ if (!tp->static_call_key) return; if (nr_func_state(tp_funcs) == TP_FUNC_1) func = tp_funcs[0].func; __static_call_update(tp->static_call_key, tp->static_call_tramp, func); } /* * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio, bool warn) { struct tracepoint_func *old, *tp_funcs; int ret; if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) { ret = tp->ext->regfunc(); if (ret < 0) return ret; } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } /* * rcu_assign_pointer has as smp_store_release() which makes sure * that the new probe callbacks array is consistent before setting * a pointer to it. This array is referenced by __DO_TRACE from * include/linux/tracepoint.h using rcu_dereference_sched(). */ switch (nr_func_state(tp_funcs)) { case TP_FUNC_1: /* 0->1 */ /* * Make sure new static func never uses old data after a * 1->0->1 transition sequence. */ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, tp_funcs); static_branch_enable(&tp->key); break; case TP_FUNC_2: /* 1->2 */ /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* * Iterator callback installed before updating tp->funcs. * Requires ordering between RCU assign/dereference and * static call update/call. */ fallthrough; case TP_FUNC_N: /* N->N+1 (N>1) */ rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>1) transition sequence. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); break; } release_probes(tp, old); return 0; } /* * Remove a probe function from a tracepoint. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured * by preempt_disable around the call site. */ static int tracepoint_remove_func(struct tracepoint *tp, struct tracepoint_func *func) { struct tracepoint_func *old, *tp_funcs; tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (WARN_ON_ONCE(IS_ERR(old))) return PTR_ERR(old); if (tp_funcs == old) /* Failed allocating new tp_funcs, replaced func with stub */ return 0; switch (nr_func_state(tp_funcs)) { case TP_FUNC_0: /* 1->0 */ /* Removed last function */ if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key)) tp->ext->unregfunc(); static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, NULL); /* * Make sure new static func never uses old data after a * 1->0->1 transition sequence. */ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1); break; case TP_FUNC_1: /* 2->1 */ rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>2) transition sequence. If the first * element's data has changed, then force the synchronization * to prevent current readers that have loaded the old data * from calling the new function. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); break; case TP_FUNC_2: /* N->N-1 (N>2) */ fallthrough; case TP_FUNC_N: rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>2) transition sequence. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); break; } release_probes(tp, old); return 0; } /** * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * @prio: priority of this function over other registered functions * * Same as tracepoint_probe_register_prio() except that it will not warn * if the tracepoint is already registered. */ int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data, int prio) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; ret = tracepoint_add_func(tp, &tp_func, prio, false); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist); /** * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for * unregistering the probe before the module is gone. This can be * performed either with a tracepoint module going notifier, or from * within module exit functions. */ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data, int prio) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; ret = tracepoint_add_func(tp, &tp_func, prio, true); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); /** * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for * unregistering the probe before the module is gone. This can be * performed either with a tracepoint module going notifier, or from * within module exit functions. */ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) { return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); } EXPORT_SYMBOL_GPL(tracepoint_probe_register); /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer * @data: tracepoint data * * Returns 0 if ok, error value on error. */ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; ret = tracepoint_remove_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); static void for_each_tracepoint_range( tracepoint_ptr_t *begin, tracepoint_ptr_t *end, void (*fct)(struct tracepoint *tp, void *priv), void *priv) { tracepoint_ptr_t *iter; if (!begin) return; for (iter = begin; iter < end; iter++) fct(tracepoint_ptr_deref(iter), priv); } #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) | (1 << TAINT_LIVEPATCH)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** * register_tracepoint_module_notifier - register tracepoint coming/going notifier * @nb: notifier block * * Notifiers registered with this function are called on module * coming/going with the tracepoint_module_list_mutex held. * The notifier block callback should expect a "struct tp_module" data * pointer. */ int register_tracepoint_module_notifier(struct notifier_block *nb) { struct tp_module *tp_mod; int ret; mutex_lock(&tracepoint_module_list_mutex); ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb); if (ret) goto end; list_for_each_entry(tp_mod, &tracepoint_module_list, list) (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod); end: mutex_unlock(&tracepoint_module_list_mutex); return ret; } EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier * @nb: notifier block * * The notifier block callback should expect a "struct tp_module" data * pointer. */ int unregister_tracepoint_module_notifier(struct notifier_block *nb) { struct tp_module *tp_mod; int ret; mutex_lock(&tracepoint_module_list_mutex); ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb); if (ret) goto end; list_for_each_entry(tp_mod, &tracepoint_module_list, list) (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod); end: mutex_unlock(&tracepoint_module_list_mutex); return ret; } EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); /* * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; if (!mod->num_tracepoints) return 0; /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. * Staging, out-of-tree, unsigned GPL, and test modules are fine. */ if (trace_module_has_bad_taint(mod)) return 0; tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); if (!tp_mod) return -ENOMEM; tp_mod->mod = mod; mutex_lock(&tracepoint_module_list_mutex); list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); mutex_unlock(&tracepoint_module_list_mutex); return 0; } static void tracepoint_module_going(struct module *mod) { struct tp_module *tp_mod; if (!mod->num_tracepoints) return; mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) { if (tp_mod->mod == mod) { blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_GOING, tp_mod); list_del(&tp_mod->list); kfree(tp_mod); /* * Called the going notifier before checking for * quiescence. */ for_each_tracepoint_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints, tp_module_going_check_quiescent, NULL); break; } } /* * In the case of modules that were tainted at "coming", we'll simply * walk through the list without finding it. We cannot use the "tainted" * flag on "going", in case a module taints the kernel only after being * loaded. */ mutex_unlock(&tracepoint_module_list_mutex); } static int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; switch (val) { case MODULE_STATE_COMING: ret = tracepoint_module_coming(mod); break; case MODULE_STATE_LIVE: break; case MODULE_STATE_GOING: tracepoint_module_going(mod); break; case MODULE_STATE_UNFORMED: break; } return notifier_from_errno(ret); } static struct notifier_block tracepoint_module_nb = { .notifier_call = tracepoint_module_notify, .priority = 0, }; static __init int init_tracepoints(void) { int ret; ret = register_module_notifier(&tracepoint_module_nb); if (ret) pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } __initcall(init_tracepoints); /** * for_each_tracepoint_in_module - iteration on all tracepoints in a module * @mod: module * @fct: callback * @priv: private data */ void for_each_tracepoint_in_module(struct module *mod, void (*fct)(struct tracepoint *tp, struct module *mod, void *priv), void *priv) { tracepoint_ptr_t *begin, *end, *iter; lockdep_assert_held(&tracepoint_module_list_mutex); if (!mod) return; begin = mod->tracepoints_ptrs; end = mod->tracepoints_ptrs + mod->num_tracepoints; for (iter = begin; iter < end; iter++) fct(tracepoint_ptr_deref(iter), mod, priv); } /** * for_each_module_tracepoint - iteration on all tracepoints in all modules * @fct: callback * @priv: private data */ void for_each_module_tracepoint(void (*fct)(struct tracepoint *tp, struct module *mod, void *priv), void *priv) { struct tp_module *tp_mod; mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) for_each_tracepoint_in_module(tp_mod->mod, fct, priv); mutex_unlock(&tracepoint_module_list_mutex); } #endif /* CONFIG_MODULES */ /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback * @priv: private data */ void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), void *priv) { for_each_tracepoint_range(__start___tracepoints_ptrs, __stop___tracepoints_ptrs, fct, priv); } EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; int syscall_regfunc(void) { struct task_struct *p, *t; if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { set_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; return 0; } void syscall_unregfunc(void) { struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { clear_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } } #endif
288 288 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 // SPDX-License-Identifier: GPL-2.0-only /* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges * * Copyright (c) 2019-2020 Red Hat GmbH * * Author: Stefano Brivio <sbrivio@redhat.com> */ /** * DOC: Theory of Operation * * * Problem * ------- * * Match packet bytes against entries composed of ranged or non-ranged packet * field specifiers, mapping them to arbitrary references. For example: * * :: * * --- fields ---> * | [net],[port],[net]... => [reference] * entries [net],[port],[net]... => [reference] * | [net],[port],[net]... => [reference] * V ... * * where [net] fields can be IP ranges or netmasks, and [port] fields are port * ranges. Arbitrary packet fields can be matched. * * * Algorithm Overview * ------------------ * * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally * relies on the consideration that every contiguous range in a space of b bits * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], * as also illustrated in Section 9 of [Kogan 2014]. * * Classification against a number of entries, that require matching given bits * of a packet field, is performed by grouping those bits in sets of arbitrary * size, and classifying packet bits one group at a time. * * Example: * to match the source port (16 bits) of a packet, we can divide those 16 bits * in 4 groups of 4 bits each. Given the entry: * 0000 0001 0101 1001 * and a packet with source port: * 0000 0001 1010 1001 * first and second groups match, but the third doesn't. We conclude that the * packet doesn't match the given entry. * * Translate the set to a sequence of lookup tables, one per field. Each table * has two dimensions: bit groups to be matched for a single packet field, and * all the possible values of said groups (buckets). Input entries are * represented as one or more rules, depending on the number of composing * netmasks for the given field specifier, and a group match is indicated as a * set bit, with number corresponding to the rule index, in all the buckets * whose value matches the entry for a given group. * * Rules are mapped between fields through an array of x, n pairs, with each * item mapping a matched rule to one or more rules. The position of the pair in * the array indicates the matched rule to be mapped to the next field, x * indicates the first rule index in the next field, and n the amount of * next-field rules the current rule maps to. * * The mapping array for the last field maps to the desired references. * * To match, we perform table lookups using the values of grouped packet bits, * and use a sequence of bitwise operations to progressively evaluate rule * matching. * * A stand-alone, reference implementation, also including notes about possible * future optimisations, is available at: * https://pipapo.lameexcu.se/ * * Insertion * --------- * * - For each packet field: * * - divide the b packet bits we want to classify into groups of size t, * obtaining ceil(b / t) groups * * Example: match on destination IP address, with t = 4: 32 bits, 8 groups * of 4 bits each * * - allocate a lookup table with one column ("bucket") for each possible * value of a group, and with one row for each group * * Example: 8 groups, 2^4 buckets: * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 * 1 * 2 * 3 * 4 * 5 * 6 * 7 * * - map the bits we want to classify for the current field, for a given * entry, to a single rule for non-ranged and netmask set items, and to one * or multiple rules for ranges. Ranges are expanded to composing netmasks * by pipapo_expand(). * * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 * - rule #0: 10.0.0.5 * - rule #1: 192.168.1.0/24 * - rule #2: 192.168.2.0/31 * * - insert references to the rules in the lookup table, selecting buckets * according to bit values of a rule in the given group. This is done by * pipapo_insert(). * * Example: given: * - rule #0: 10.0.0.5 mapping to buckets * < 0 10 0 0 0 0 0 5 > * - rule #1: 192.168.1.0/24 mapping to buckets * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > * - rule #2: 192.168.2.0/31 mapping to buckets * < 12 0 10 8 0 2 0 < 0..1 > > * * these bits are set in the lookup table: * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * 1 1,2 0 * 2 0 1,2 * 3 0 1,2 * 4 0,1,2 * 5 0 1 2 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * * - if this is not the last field in the set, fill a mapping array that maps * rules from the lookup table to rules belonging to the same entry in * the next lookup table, done by pipapo_map(). * * Note that as rules map to contiguous ranges of rules, given how netmask * expansion and insertion is performed, &union nft_pipapo_map_bucket stores * this information as pairs of first rule index, rule count. * * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, * given lookup table #0 for field 0 (see example above): * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * 1 1,2 0 * 2 0 1,2 * 3 0 1,2 * 4 0,1,2 * 5 0 1 2 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * * and lookup table #1 for field 1 with: * - rule #0: 1024 mapping to buckets * < 0 0 4 0 > * - rule #1: 2048 mapping to buckets * < 0 0 5 0 > * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0,1 * 1 0,1 * 2 0 1 * 3 0,1 * * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 * (rules #1, #2) to 2048 in lookup table #2 (rule #1): * * :: * * rule indices in current field: 0 1 2 * map to rules in next field: 0 1 1 * * - if this is the last field in the set, fill a mapping array that maps * rules from the last lookup table to element pointers, also done by * pipapo_map(). * * Note that, in this implementation, we have two elements (start, end) for * each entry. The pointer to the end element is stored in this array, and * the pointer to the start element is linked from it. * * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. * From the rules of lookup table #1 as mapped above: * * :: * * rule indices in last field: 0 1 * map to elements: 0x66 0x42 * * * Matching * -------- * * We use a result bitmap, with the size of a single lookup table bucket, to * represent the matching state that applies at every algorithm step. This is * done by pipapo_lookup(). * * - For each packet field: * * - start with an all-ones result bitmap (res_map in pipapo_lookup()) * * - perform a lookup into the table corresponding to the current field, * for each group, and at every group, AND the current result bitmap with * the value from the lookup table bucket * * :: * * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from * insertion examples. * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for * convenience in this example. Initial result bitmap is 0xff, the steps * below show the value of the result bitmap after each group is processed: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 * * 1 1,2 0 * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 * * 2 0 1,2 * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 * * 3 0 1,2 * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 * * 4 0,1,2 * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 * * 5 0 1 2 * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 * * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 * * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 * * - at the next field, start with a new, all-zeroes result bitmap. For each * bit set in the previous result bitmap, fill the new result bitmap * (fill_map in pipapo_lookup()) with the rule indices from the * corresponding buckets of the mapping field for this field, done by * pipapo_refill() * * Example: with mapping table from insertion examples, with the current * result bitmap from the previous example, 0x02: * * :: * * rule indices in current field: 0 1 2 * map to rules in next field: 0 1 1 * * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be * set. * * We can now extend this example to cover the second iteration of the step * above (lookup and AND bitmap): assuming the port field is * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table * for "port" field from pre-computation example: * * :: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0,1 * 1 0,1 * 2 0 1 * 3 0,1 * * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] * & 0x3 [bucket 0], resulting bitmap is 0x2. * * - if this is the last field in the set, look up the value from the mapping * array corresponding to the final result bitmap * * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for * last field from insertion example: * * :: * * rule indices in last field: 0 1 * map to elements: 0x66 0x42 * * the matching element is at 0x42. * * * References * ---------- * * [Ligatti 2010] * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with * Automatic Time-space Tradeoffs * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion * Ori Rottenstreich and Isaac Keslassy. * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf * * [Kogan 2014] * SAX-PAC (Scalable And eXpressive PAcket Classification) * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <uapi/linux/netfilter/nf_tables.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" /** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits * @len: Length of bitmap in longs * @rules: Number of rules in field * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers * @match_only: Find a single bit and return, don't fill * * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. * * For each bit set in map, select the bucket from mapping table with index * corresponding to the position of the bit set. Use start bit and amount of * bits specified in bucket to fill region in dst. * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. */ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, unsigned long *dst, const union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; unsigned int k; int ret = -1; for (k = 0; k < len; k++) { bitset = map[k]; while (bitset) { unsigned long t = bitset & -bitset; int r = __builtin_ctzl(bitset); int i = k * BITS_PER_LONG + r; if (unlikely(i >= rules)) { map[k] = 0; return -1; } if (match_only) { bitmap_clear(map, i, 1); return i; } ret = 0; bitmap_set(dst, mt[i].to, mt[i].n); bitset ^= t; } map[k] = 0; } return ret; } /** * pipapo_get_slow() - Get matching element reference given key data * @m: storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask * @tstamp: timestamp to check for expired elements * * For more details, see DOC: Theory of Operation. * * This is the main lookup function. It matches key data against either * the working match set or the uncommitted copy, depending on what the * caller passed to us. * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup * (datapath lookup) pass the active copy. * The insertion path will pass the uncommitted working copy. * * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, const u8 *data, u8 genmask, u64 tstamp) { unsigned long *res_map, *fill_map, *map; struct nft_pipapo_scratch *scratch; const struct nft_pipapo_field *f; bool map_index; int i; local_bh_disable(); scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) goto out; __local_lock_nested_bh(&scratch->bh_lock); map_index = scratch->map_index; map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); res_map = map + (map_index ? m->bsize_max : 0); fill_map = map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init(m, res_map); nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; int b; /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ if (likely(f->bb == 8)) pipapo_and_field_buckets_8bit(f, res_map, data); else pipapo_and_field_buckets_4bit(f, res_map, data); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' * pointer if any. * * Now res_map contains the matching bitmap, and fill_map is the * bitmap for the next field. */ next_match: b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, last); if (b < 0) { scratch->map_index = map_index; __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return NULL; } if (last) { struct nft_pipapo_elem *e; e = f->mt[b].e; if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || !nft_set_elem_active(&e->ext, genmask))) goto next_match; /* Last field: we're just returning the key without * filling the initial bitmap for the next field, so the * current inactive bitmap is clean and can be reused as * *next* bitmap (not initial) for the next packet. */ scratch->map_index = map_index; __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return e; } /* Swap bitmap indices: res_map is the initial bitmap for the * next field, and fill_map is guaranteed to be all-zeroes at * this point. */ map_index = !map_index; swap(res_map, fill_map); data += NFT_PIPAPO_GROUPS_PADDING(f); } __local_unlock_nested_bh(&scratch->bh_lock); out: local_bh_enable(); return NULL; } /** * pipapo_get() - Get matching element reference given key data * @m: Storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask * @tstamp: Timestamp to check for expired elements * * This is a dispatcher function, either calling out the generic C * implementation or, if available, the AVX2 one. * This helper is only called from the control plane, with either RCU * read lock or transaction mutex held. * * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, const u8 *data, u8 genmask, u64 tstamp) { struct nft_pipapo_elem *e; local_bh_disable(); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) { e = pipapo_get_avx2(m, data, genmask, tstamp); local_bh_enable(); return e; } #endif e = pipapo_get_slow(m, data, genmask, tstamp); local_bh_enable(); return e; } /** * nft_pipapo_lookup() - Dataplane fronted for main lookup function * @net: Network namespace * @set: nftables API set representation * @key: pointer to nft registers containing key data * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. * Unlike other set types, this uses 0 instead of nft_genmask_cur(). * * This is because new (future) elements are not reachable from * priv->match, they get added to priv->clone instead. * When the commit phase flips the generation bitmask, the * 'now old' entries are skipped but without the 'now current' * elements becoming visible. Using nft_genmask_cur() thus creates * inconsistent state: matching old entries get skipped but thew * newly matching entries are unreachable. * * GENMASK_ANY doesn't work for the same reason: old-gen entries get * skipped, new-gen entries are only reachable from priv->clone. * * nft_pipapo_commit swaps ->clone and ->match shortly after the * genbit flip. As ->clone doesn't contain the old entries in the first * place, lookup will only find the now-current ones. * * Return: ntables API extension pointer or NULL if no match. */ const struct nft_set_ext * nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); return e ? &e->ext : NULL; } /** * nft_pipapo_get() - Get matching element reference given key data * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data * @flags: Unused * * This function is called from the control plane path under * RCU read lock. * * Return: set element private pointer or ERR_PTR(-ENOENT). */ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = rcu_dereference(priv->match); struct nft_pipapo_elem *e; e = pipapo_get(m, (const u8 *)elem->key.val.data, nft_genmask_cur(net), get_jiffies_64()); if (!e) return ERR_PTR(-ENOENT); return &e->priv; } /** * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize * @f: Field containing mapping table * @old_rules: Amount of existing mapped rules * @rules: Amount of new rules to map * * Return: 0 on success, negative error code on failure. */ static int pipapo_realloc_mt(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules) { union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; const unsigned int extra = PAGE_SIZE / sizeof(*new_mt); unsigned int rules_alloc = rules; might_sleep(); if (unlikely(rules == 0)) goto out_free; /* growing and enough space left, no action needed */ if (rules > old_rules && f->rules_alloc > rules) return 0; /* downsize and extra slack has not grown too large */ if (rules < old_rules) { unsigned int remove = f->rules_alloc - rules; if (remove < (2u * extra)) return 0; } /* If set needs more than one page of memory for rules then * allocate another extra page to avoid frequent reallocation. */ if (rules > extra && check_add_overflow(rules, extra, &rules_alloc)) return -EOVERFLOW; if (rules_alloc > (INT_MAX / sizeof(*new_mt))) return -ENOMEM; new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; if (old_mt) memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); if (rules > old_rules) { memset(new_mt + old_rules, 0, (rules - old_rules) * sizeof(*new_mt)); } out_free: f->rules_alloc = rules_alloc; f->mt = new_mt; kvfree(old_mt); return 0; } /** * lt_calculate_size() - Get storage size for lookup table with overflow check * @groups: Amount of bit groups * @bb: Number of bits grouped together in lookup table buckets * @bsize: Size of each bucket in lookup table, in longs * * Return: allocation size including alignment overhead, negative on overflow */ static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, unsigned int bsize) { ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); if (check_mul_overflow(ret, bsize, &ret)) return -1; if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) return -1; if (ret > INT_MAX) return -1; return ret; } /** * pipapo_resize() - Resize lookup or mapping table, or both * @f: Field containing lookup and mapping tables * @old_rules: Previous amount of rules in field * @rules: New amount of rules * * Increase, decrease or maintain tables size depending on new amount of rules, * and copy data over. In case the new size is smaller, throw away data for * highest-numbered rules. * * Return: 0 on success, -ENOMEM on allocation failure. */ static int pipapo_resize(struct nft_pipapo_field *f, unsigned int old_rules, unsigned int rules) { long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; unsigned int new_bucket_size, copy; int group, bucket, err; ssize_t lt_size; if (rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); #ifdef NFT_PIPAPO_ALIGN new_bucket_size = roundup(new_bucket_size, NFT_PIPAPO_ALIGN / sizeof(*new_lt)); #endif if (new_bucket_size == f->bsize) goto mt; if (new_bucket_size > f->bsize) copy = f->bsize; else copy = new_bucket_size; lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); if (lt_size < 0) return -ENOMEM; new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return -ENOMEM; new_p = NFT_PIPAPO_LT_ALIGN(new_lt); old_p = NFT_PIPAPO_LT_ALIGN(old_lt); for (group = 0; group < f->groups; group++) { for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { memcpy(new_p, old_p, copy * sizeof(*new_p)); new_p += copy; old_p += copy; if (new_bucket_size > f->bsize) new_p += new_bucket_size - f->bsize; else old_p += f->bsize - new_bucket_size; } } mt: err = pipapo_realloc_mt(f, old_rules, rules); if (err) { kvfree(new_lt); return err; } if (new_lt) { f->bsize = new_bucket_size; f->lt = new_lt; kvfree(old_lt); } return 0; } /** * pipapo_bucket_set() - Set rule bit in bucket given group and group value * @f: Field containing lookup table * @rule: Rule index * @group: Group index * @v: Value of bit group */ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, int v) { unsigned long *pos; pos = NFT_PIPAPO_LT_ALIGN(f->lt); pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; pos += f->bsize * v; __set_bit(rule, pos); } /** * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits * @old_groups: Number of current groups * @bsize: Size of one bucket, in longs * @old_lt: Pointer to the current lookup table * @new_lt: Pointer to the new, pre-allocated lookup table * * Each bucket with index b in the new lookup table, belonging to group g, is * filled with the bit intersection between: * - bucket with index given by the upper 4 bits of b, from group g, and * - bucket with index given by the lower 4 bits of b, from group g + 1 * * That is, given buckets from the new lookup table N(x, y) and the old lookup * table O(x, y), with x bucket index, and y group index: * * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) * * This ensures equivalence of the matching results on lookup. Two examples in * pictures: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 * 0 ^ * 1 | ^ * ... ( & ) | * / \ | * / \ .-( & )-. * / bucket \ | | * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | * 0 / \ | | * 1 \ | | * 2 | --' * 3 '- * ... */ static void pipapo_lt_4b_to_8b(int old_groups, int bsize, unsigned long *old_lt, unsigned long *new_lt) { int g, b, i; for (g = 0; g < old_groups / 2; g++) { int src_g0 = g * 2, src_g1 = g * 2 + 1; for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { int src_b0 = b / NFT_PIPAPO_BUCKETS(4); int src_b1 = b % NFT_PIPAPO_BUCKETS(4); int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; for (i = 0; i < bsize; i++) { *new_lt = old_lt[src_i0 * bsize + i] & old_lt[src_i1 * bsize + i]; new_lt++; } } } } /** * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits * @old_groups: Number of current groups * @bsize: Size of one bucket, in longs * @old_lt: Pointer to the current lookup table * @new_lt: Pointer to the new, pre-allocated lookup table * * Each bucket with index b in the new lookup table, belonging to group g, is * filled with the bit union of: * - all the buckets with index such that the upper four bits of the lower byte * equal b, from group g, with g odd * - all the buckets with index such that the lower four bits equal b, from * group g, with g even * * That is, given buckets from the new lookup table N(x, y) and the old lookup * table O(x, y), with x bucket index, and y group index: * * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) * * where U() denotes the arbitrary union operation (binary OR of n terms). This * ensures equivalence of the matching results on lookup. */ static void pipapo_lt_8b_to_4b(int old_groups, int bsize, unsigned long *old_lt, unsigned long *new_lt) { int g, b, bsrc, i; memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * sizeof(unsigned long)); for (g = 0; g < old_groups * 2; g += 2) { int src_g = g / 2; for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); bsrc++) { if (((bsrc & 0xf0) >> 4) != b) continue; for (i = 0; i < bsize; i++) new_lt[i] |= old_lt[bsrc * bsize + i]; } new_lt += bsize; } for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); bsrc++) { if ((bsrc & 0x0f) != b) continue; for (i = 0; i < bsize; i++) new_lt[i] |= old_lt[bsrc * bsize + i]; } new_lt += bsize; } } } /** * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed * @f: Field containing lookup table */ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { unsigned int groups, bb; unsigned long *new_lt; ssize_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; lt_size = lt_calculate_size(groups, bb, f->bsize); if (lt_size < 0) return; } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; lt_size = lt_calculate_size(groups, bb, f->bsize); if (lt_size < 0) return; /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. */ if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) return; } else { return; } new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return; NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; if (f->bb == 4 && bb == 8) { pipapo_lt_4b_to_8b(f->groups, f->bsize, NFT_PIPAPO_LT_ALIGN(f->lt), NFT_PIPAPO_LT_ALIGN(new_lt)); } else if (f->bb == 8 && bb == 4) { pipapo_lt_8b_to_4b(f->groups, f->bsize, NFT_PIPAPO_LT_ALIGN(f->lt), NFT_PIPAPO_LT_ALIGN(new_lt)); } else { BUG(); } f->groups = groups; f->bb = bb; kvfree(f->lt); f->lt = new_lt; } /** * pipapo_insert() - Insert new rule in field given input key and mask length * @f: Field containing lookup table * @k: Input key for classification, without nftables padding * @mask_bits: Length of mask; matches field length for non-ranged entry * * Insert a new rule reference in lookup buckets corresponding to k and * mask_bits. * * Return: 1 on success (one rule inserted), negative error code on failure. */ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { unsigned int rule = f->rules, group, ret, bit_offset = 0; ret = pipapo_resize(f, f->rules, f->rules + 1); if (ret) return ret; f->rules++; for (group = 0; group < f->groups; group++) { int i, v; u8 mask; v = k[group / (BITS_PER_BYTE / f->bb)]; v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); v >>= (BITS_PER_BYTE - bit_offset) - f->bb; bit_offset += f->bb; bit_offset %= BITS_PER_BYTE; if (mask_bits >= (group + 1) * f->bb) { /* Not masked */ pipapo_bucket_set(f, rule, group, v); } else if (mask_bits <= group * f->bb) { /* Completely masked */ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) pipapo_bucket_set(f, rule, group, i); } else { /* The mask limit falls on this group */ mask = GENMASK(f->bb - 1, 0); mask >>= mask_bits - group * f->bb; for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { if ((i & ~mask) == (v & ~mask)) pipapo_bucket_set(f, rule, group, i); } } } pipapo_lt_bits_adjust(f); return 1; } /** * pipapo_step_diff() - Check if setting @step bit in netmask would change it * @base: Mask we are expanding * @step: Step bit for given expansion step * @len: Total length of mask space (set and unset bits), bytes * * Convenience function for mask expansion. * * Return: true if step bit changes mask (i.e. isn't set), false otherwise. */ static bool pipapo_step_diff(u8 *base, int step, int len) { /* Network order, byte-addressed */ #ifdef __BIG_ENDIAN__ return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); #else return !(BIT(step % BITS_PER_BYTE) & base[len - 1 - step / BITS_PER_BYTE]); #endif } /** * pipapo_step_after_end() - Check if mask exceeds range end with given step * @base: Mask we are expanding * @end: End of range * @step: Step bit for given expansion step, highest bit to be set * @len: Total length of mask space (set and unset bits), bytes * * Convenience function for mask expansion. * * Return: true if mask exceeds range setting step bits, false otherwise. */ static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, int len) { u8 tmp[NFT_PIPAPO_MAX_BYTES]; int i; memcpy(tmp, base, len); /* Network order, byte-addressed */ for (i = 0; i <= step; i++) #ifdef __BIG_ENDIAN__ tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); #else tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); #endif return memcmp(tmp, end, len) > 0; } /** * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry * @base: Netmask base * @step: Step bit to sum * @len: Netmask length, bytes */ static void pipapo_base_sum(u8 *base, int step, int len) { bool carry = false; int i; /* Network order, byte-addressed */ #ifdef __BIG_ENDIAN__ for (i = step / BITS_PER_BYTE; i < len; i++) { #else for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { #endif if (carry) base[i]++; else base[i] += 1 << (step % BITS_PER_BYTE); if (base[i]) break; carry = true; } } /** * pipapo_expand() - Expand to composing netmasks, insert into lookup table * @f: Field containing lookup table * @start: Start of range * @end: End of range * @len: Length of value in bits * * Expand range to composing netmasks and insert corresponding rule references * in lookup buckets. * * Return: number of inserted rules on success, negative error code on failure. */ static int pipapo_expand(struct nft_pipapo_field *f, const u8 *start, const u8 *end, int len) { int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); u8 base[NFT_PIPAPO_MAX_BYTES]; memcpy(base, start, bytes); while (memcmp(base, end, bytes) <= 0) { int err; step = 0; while (pipapo_step_diff(base, step, bytes)) { if (pipapo_step_after_end(base, end, step, bytes)) break; step++; if (step >= len) { if (!masks) { err = pipapo_insert(f, base, 0); if (err < 0) return err; masks = 1; } goto out; } } err = pipapo_insert(f, base, len - step); if (err < 0) return err; masks++; pipapo_base_sum(base, step, bytes); } out: return masks; } /** * pipapo_map() - Insert rules in mapping tables, mapping them between fields * @m: Matching data, including mapping table * @map: Table of rule maps: array of first rule and amount of rules * in next field a given rule maps to, for each field * @e: For last field, nft_set_ext pointer matching rules map to */ static void pipapo_map(struct nft_pipapo_match *m, union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], struct nft_pipapo_elem *e) { struct nft_pipapo_field *f; int i, j; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { for (j = 0; j < map[i].n; j++) { f->mt[map[i].to + j].to = map[i + 1].to; f->mt[map[i].to + j].n = map[i + 1].n; } } /* Last field: map to ext instead of mapping to next field */ for (j = 0; j < map[i].n; j++) f->mt[map[i].to + j].e = e; } /** * pipapo_free_scratch() - Free per-CPU map at original address * @m: Matching data * @cpu: CPU number */ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) { struct nft_pipapo_scratch *s; s = *per_cpu_ptr(m->scratch, cpu); kvfree(s); } /** * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results * @clone: Copy of matching data with pending insertions and deletions * @bsize_max: Maximum bucket size, scratch maps cover two buckets * * Return: 0 on success, -ENOMEM on failure. */ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, unsigned long bsize_max) { int i; for_each_possible_cpu(i) { struct nft_pipapo_scratch *scratch; scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have * a bigger allocated size now (this is only called on * insertion), but the extra space won't be used by any * CPU as new elements are not inserted and m->bsize_max * is not updated. */ return -ENOMEM; } pipapo_free_scratch(clone, i); local_lock_init(&scratch->bh_lock); *per_cpu_ptr(clone->scratch, i) = scratch; } return 0; } static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) { #ifdef CONFIG_PROVE_LOCKING const struct net *net = read_pnet(&set->net); return lockdep_is_held(&nft_pernet(net)->commit_mutex); #else return true; #endif } static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); /** * pipapo_maybe_clone() - Build clone for pending data changes, if not existing * @set: nftables API set representation * * Return: newly created or existing clone, if any. NULL on allocation failure */ static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; if (priv->clone) return priv->clone; m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); priv->clone = pipapo_clone(m); return priv->clone; } /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element * * Return: 0 on success, error pointer on failure. */ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); struct nft_pipapo_elem *e, *dup; u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_field *f; const u8 *start_p, *end_p; int i, bsize_max, err = 0; if (!m) return -ENOMEM; if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; dup = pipapo_get(m, start, genmask, tstamp); if (dup) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; dup_key = nft_set_ext_key(&dup->ext); if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) dup_end = nft_set_ext_key_end(&dup->ext); else dup_end = dup_key; if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { *elem_priv = &dup->priv; return -EEXIST; } return -ENOTEMPTY; } /* Look for partially overlapping entries */ dup = pipapo_get(m, end, nft_genmask_next(net), tstamp); if (dup) { *elem_priv = &dup->priv; return -ENOTEMPTY; } /* Validate */ start_p = start; end_p = end; /* some helpers return -1, or 0 >= for valid rule pos, * so we cannot support more than INT_MAX rules at this time. */ BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); nft_pipapo_for_each_field(f, i, m) { if (f->rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; if (memcmp(start_p, end_p, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) return -EINVAL; start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } /* Insert */ bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { int ret; rulemap[i].to = f->rules; ret = memcmp(start, end, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); if (!ret) ret = pipapo_insert(f, start, f->groups * f->bb); else ret = pipapo_expand(f, start, end, f->groups * f->bb); if (ret < 0) return ret; if (f->bsize > bsize_max) bsize_max = f->bsize; rulemap[i].n = ret; start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { put_cpu_ptr(m->scratch); err = pipapo_realloc_scratch(m, bsize_max); if (err) return err; m->bsize_max = bsize_max; } else { put_cpu_ptr(m->scratch); } e = nft_elem_priv_cast(elem->priv); *elem_priv = &e->priv; pipapo_map(m, rulemap, e); return 0; } /** * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { struct nft_pipapo_field *dst, *src; struct nft_pipapo_match *new; int i; new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); if (!new) return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; new->scratch = alloc_percpu(*new->scratch); if (!new->scratch) goto out_scratch; for_each_possible_cpu(i) *per_cpu_ptr(new->scratch, i) = NULL; if (pipapo_realloc_scratch(new, old->bsize_max)) goto out_scratch_realloc; rcu_head_init(&new->rcu); src = old->f; dst = new->f; for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; ssize_t lt_size; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); if (lt_size < 0) goto out_lt; new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; dst->lt = new_lt; memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * src->groups * NFT_PIPAPO_BUCKETS(src->bb)); if (src->rules > 0) { if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) goto out_mt; dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL_ACCOUNT); if (!dst->mt) goto out_mt; memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); } else { dst->mt = NULL; dst->rules_alloc = 0; } src++; dst++; } return new; out_mt: kvfree(dst->lt); out_lt: for (dst--; i > 0; i--) { kvfree(dst->mt); kvfree(dst->lt); dst--; } out_scratch_realloc: for_each_possible_cpu(i) pipapo_free_scratch(new, i); out_scratch: free_percpu(new->scratch); kfree(new); return NULL; } /** * pipapo_rules_same_key() - Get number of rules originated from the same entry * @f: Field containing mapping table * @first: Index of first rule in set of rules mapping to same entry * * Using the fact that all rules in a field that originated from the same entry * will map to the same set of rules in the next field, or to the same element * reference, return the cardinality of the set of rules that originated from * the same entry as the rule with index @first, @first rule included. * * In pictures: * rules * field #0 0 1 2 3 4 * map to: 0 1 2-4 2-4 5-9 * . . ....... . ... * | | | | \ \ * | | | | \ \ * | | | | \ \ * ' ' ' ' ' \ * in field #1 0 1 2 3 4 5 ... * * if this is called for rule 2 on field #0, it will return 3, as also rules 2 * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. * * For the last field in a set, we can rely on associated entries to map to the * same element references. * * Return: Number of rules that originated from the same entry as @first. */ static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) { struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ unsigned int r; for (r = first; r < f->rules; r++) { if (r != first && e != f->mt[r].e) return r - first; e = f->mt[r].e; } if (r != first) return r - first; return 0; } /** * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones * @mt: Mapping array * @rules: Original amount of rules in mapping table * @start: First rule index to be removed * @n: Amount of rules to be removed * @to_offset: First rule index, in next field, this group of rules maps to * @is_last: If this is the last field, delete reference from mapping array * * This is used to unmap rules from the mapping table for a single field, * maintaining consistency and compactness for the existing ones. * * In pictures: let's assume that we want to delete rules 2 and 3 from the * following mapping array: * * rules * 0 1 2 3 4 * map to: 4-10 4-10 11-15 11-15 16-18 * * the result will be: * * rules * 0 1 2 * map to: 4-10 4-10 11-13 * * for fields before the last one. In case this is the mapping table for the * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: * * rules * 0 1 2 3 4 * element pointers: 0x42 0x42 0x33 0x33 0x44 * * the result will be: * * rules * 0 1 2 * element pointers: 0x42 0x42 0x44 */ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, unsigned int start, unsigned int n, unsigned int to_offset, bool is_last) { int i; memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); memset(mt + rules - n, 0, n * sizeof(*mt)); if (is_last) return; for (i = start; i < rules - n; i++) mt[i].to -= to_offset; } /** * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map * @m: Matching data * @rulemap: Table of rule maps, arrays of first rule and amount of rules * in next field a given entry maps to, for each field * * For each rule in lookup table buckets mapping to this set of rules, drop * all bits set in lookup table mapping. In pictures, assuming we want to drop * rules 0 and 1 from this lookup table: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 1,2 * 1 1,2 0 * 2 0 1,2 * 3 0 1,2 * 4 0,1,2 * 5 0 1 2 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 * * rule 2 becomes rule 0, and the result will be: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 0 * 1 0 * 2 0 * 3 0 * 4 0 * 5 0 * 6 0 * 7 0 0 * * once this is done, call unmap() to drop all the corresponding rule references * from mapping tables. */ static void pipapo_drop(struct nft_pipapo_match *m, union nft_pipapo_map_bucket rulemap[]) { struct nft_pipapo_field *f; int i; nft_pipapo_for_each_field(f, i, m) { int g; for (g = 0; g < f->groups; g++) { unsigned long *pos; int b; pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { bitmap_cut(pos, pos, rulemap[i].to, rulemap[i].n, f->bsize * BITS_PER_LONG); pos += f->bsize; } } pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, rulemap[i + 1].n, i == m->field_count - 1); if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { /* We can ignore this, a failure to shrink tables down * doesn't make tables invalid. */ ; } f->rules -= rulemap[i].n; pipapo_lt_bits_adjust(f); } } static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, struct nft_pipapo_elem *e) { nft_setelem_data_deactivate(net, set, &e->priv); } /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements * @set: nftables API set representation * @m: Matching data */ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); struct net *net = read_pnet(&set->net); unsigned int rules_f0, first_rule = 0; u64 tstamp = nft_net_tstamp(net); struct nft_pipapo_elem *e; struct nft_trans_gc *gc; gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const struct nft_pipapo_field *f; unsigned int i, start, rules_fx; start = first_rule; rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { rulemap[i].to = start; rulemap[i].n = rules_fx; if (i < m->field_count - 1) { rules_fx = f->mt[start].n; start = f->mt[start].to; } } /* Pick the last field, and its last index */ f--; i--; e = f->mt[rulemap[i].to].e; /* synchronous gc never fails, there is no need to set on * NFT_SET_ELEM_DEAD_BIT. */ if (__nft_set_elem_expired(&e->ext, tstamp)) { gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. */ } else { first_rule += rules_f0; } } gc = nft_trans_gc_catchall_sync(gc); if (gc) { nft_trans_gc_queue_sync_done(gc); priv->last_gc = jiffies; } } /** * pipapo_free_fields() - Free per-field tables contained in matching data * @m: Matching data */ static void pipapo_free_fields(struct nft_pipapo_match *m) { struct nft_pipapo_field *f; int i; nft_pipapo_for_each_field(f, i, m) { kvfree(f->lt); kvfree(f->mt); } } static void pipapo_free_match(struct nft_pipapo_match *m) { int i; for_each_possible_cpu(i) pipapo_free_scratch(m, i); free_percpu(m->scratch); pipapo_free_fields(m); kfree(m); } /** * pipapo_reclaim_match - RCU callback to free fields from old matching data * @rcu: RCU head */ static void pipapo_reclaim_match(struct rcu_head *rcu) { struct nft_pipapo_match *m; m = container_of(rcu, struct nft_pipapo_match, rcu); pipapo_free_match(m); } /** * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working * copy before committing it for lookup, and don't replace the table if the * working copy doesn't have pending changes. * * We also need to create a new working copy for subsequent insertions and * deletions. */ static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *old; if (!priv->clone) return; if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) pipapo_gc(set, priv->clone); old = rcu_replace_pointer(priv->match, priv->clone, nft_pipapo_transaction_mutex_held(set)); priv->clone = NULL; if (old) call_rcu(&old->rcu, pipapo_reclaim_match); } static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); if (!priv->clone) return; pipapo_free_match(priv->clone); priv->clone = NULL; } /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); nft_clear(net, &e->ext); } /** * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network namespace * @set: nftables API set representation * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ static struct nft_elem_priv * nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; /* removal must occur on priv->clone, if we are low on memory * we have no choice and must fail the removal request. */ if (!m) return NULL; e = pipapo_get(m, (const u8 *)elem->key.val.data, nft_genmask_next(net), nft_net_tstamp(net)); if (!e) return NULL; nft_set_elem_change_active(net, set, &e->ext); return &e->priv; } /** * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data * * This is functionally the same as nft_pipapo_deactivate(), with a slightly * different interface, and it's also called once for each element in a set * being flushed, so we can't implement, strictly speaking, a flush operation, * which would otherwise be as simple as allocating an empty copy of the * matching data. * * Note that we could in theory do that, mark the set as flushed, and ignore * subsequent calls, but we would leak all the elements after the first one, * because they wouldn't then be freed as result of API calls. * * Return: true if element was found and deactivated. */ static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &e->ext); } /** * pipapo_get_boundaries() - Get byte interval for associated rules * @f: Field including lookup table * @first_rule: First rule (lowest index) * @rule_count: Number of associated rules * @left: Byte expression for left boundary (start of range) * @right: Byte expression for right boundary (end of range) * * Given the first rule and amount of rules that originated from the same entry, * build the original range associated with the entry, and calculate the length * of the originating netmask. * * In pictures: * * bucket * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * 0 1,2 * 1 1,2 * 2 1,2 * 3 1,2 * 4 1,2 * 5 1 2 * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 * * this is the lookup table corresponding to the IPv4 range * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. * * This function fills @left and @right with the byte values of the leftmost * and rightmost bucket indices for the lowest and highest rule indices, * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in * nibbles: * left: < 12, 0, 10, 8, 0, 1, 0, 0 > * right: < 12, 0, 10, 8, 0, 2, 2, 1 > * corresponding to bytes: * left: < 192, 168, 1, 0 > * right: < 192, 168, 2, 1 > * with mask length irrelevant here, unused on return, as the range is already * defined by its start and end points. The mask length is relevant for a single * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes * < 192, 168, 1, 255 >, and the mask length, calculated from the distances * between leftmost and rightmost bucket indices for each group, would be 24. * * Return: mask length, in bits. */ static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, int rule_count, u8 *left, u8 *right) { int g, mask_len = 0, bit_offset = 0; u8 *l = left, *r = right; for (g = 0; g < f->groups; g++) { int b, x0, x1; x0 = -1; x1 = -1; for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { unsigned long *pos; pos = NFT_PIPAPO_LT_ALIGN(f->lt) + (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; if (test_bit(first_rule, pos) && x0 == -1) x0 = b; if (test_bit(first_rule + rule_count - 1, pos)) x1 = b; } *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); bit_offset += f->bb; if (bit_offset >= BITS_PER_BYTE) { bit_offset %= BITS_PER_BYTE; l++; r++; } if (x1 - x0 == 0) mask_len += 4; else if (x1 - x0 == 1) mask_len += 3; else if (x1 - x0 == 3) mask_len += 2; else if (x1 - x0 == 7) mask_len += 1; } return mask_len; } /** * pipapo_match_field() - Match rules against byte ranges * @f: Field including the lookup table * @first_rule: First of associated rules originating from same entry * @rule_count: Amount of associated rules * @start: Start of range to be matched * @end: End of range to be matched * * Return: true on match, false otherwise. */ static bool pipapo_match_field(struct nft_pipapo_field *f, int first_rule, int rule_count, const u8 *start, const u8 *end) { u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; pipapo_get_boundaries(f, first_rule, rule_count, left, right); return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); } /** * nft_pipapo_remove() - Remove element given key, commit * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data * * Similarly to nft_pipapo_activate(), this is used as commit operation by the * API, but it's called once per element in the pending transaction, so we can't * implement this as a single commit operation. Closest we can get is to remove * the matched element here, if any, and commit the updated matching data. */ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; unsigned int rules_f0, first_rule = 0; struct nft_pipapo_elem *e; const u8 *data; e = nft_elem_priv_cast(elem_priv); data = (const u8 *)nft_set_ext_key(&e->ext); while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; struct nft_pipapo_field *f; int i, start, rules_fx; match_start = data; if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END)) match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; else match_end = data; start = first_rule; rules_fx = rules_f0; nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; if (!pipapo_match_field(f, start, rules_fx, match_start, match_end)) break; rulemap[i].to = start; rulemap[i].n = rules_fx; rules_fx = f->mt[start].n; start = f->mt[start].to; match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); if (last && f->mt[rulemap[i].to].e == e) { pipapo_drop(m, rulemap); return; } } first_rule += rules_f0; } WARN_ON_ONCE(1); /* elem_priv not found */ } /** * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first * field. @m is protected either by RCU read lock or by transaction mutex. */ static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_pipapo_match *m, struct nft_set_iter *iter) { const struct nft_pipapo_field *f; unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; if (iter->count < iter->skip) goto cont; e = f->mt[r].e; iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) return; cont: iter->count++; } } /** * nft_pipapo_walk() - Walk over elements * @ctx: nftables API context * @set: nftables API set representation * @iter: Iterator * * Test if destructive action is needed or not, clone active backend if needed * and call the real function to work on the data. */ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); const struct nft_pipapo_match *m; switch (iter->type) { case NFT_ITER_UPDATE: m = pipapo_maybe_clone(set); if (!m) { iter->err = -ENOMEM; return; } nft_pipapo_do_walk(ctx, set, m, iter); break; case NFT_ITER_READ: rcu_read_lock(); m = rcu_dereference(priv->match); nft_pipapo_do_walk(ctx, set, m, iter); rcu_read_unlock(); break; default: iter->err = -EINVAL; WARN_ON_ONCE(1); break; } } /** * nft_pipapo_privsize() - Return the size of private data for the set * @nla: netlink attributes, ignored as size doesn't depend on them * @desc: Set description, ignored as size doesn't depend on it * * Return: size of private data for this set implementation, in bytes */ static u64 nft_pipapo_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { return sizeof(struct nft_pipapo); } /** * nft_pipapo_estimate() - Set size, space and lookup complexity * @desc: Set description, element count and field description used * @features: Flags: NFT_SET_INTERVAL needs to be there * @est: Storage for estimation data * * Return: true if set description is compatible, false otherwise */ static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (!(features & NFT_SET_INTERVAL) || desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; est->size = pipapo_estimate_size(desc); if (!est->size) return false; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; return true; } /** * nft_pipapo_init() - Initialise data for a set instance * @set: nftables API set representation * @desc: Set description * @nla: netlink attributes * * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink * attributes, initialise internal set parameters, current instance of matching * data and a copy for subsequent insertions. * * Return: 0 on success, negative error code on failure. */ static int nft_pipapo_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int err, i, field_count; BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); field_count = desc->field_count ? : 1; BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); if (!m) return -ENOMEM; m->field_count = field_count; m->bsize_max = 0; m->scratch = alloc_percpu(struct nft_pipapo_scratch *); if (!m->scratch) { err = -ENOMEM; goto out_scratch; } for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { unsigned int len = desc->field_len[i] ? : set->klen; /* f->groups is u8 */ BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); f->bb = NFT_PIPAPO_GROUP_BITS_INIT; f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); priv->width += round_up(len, sizeof(u32)); f->bsize = 0; f->rules = 0; f->rules_alloc = 0; f->lt = NULL; f->mt = NULL; } rcu_assign_pointer(priv->match, m); return 0; out_scratch: kfree(m); return err; } /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; unsigned int i, r; for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; for (r = 0; r < f->rules; r++) { struct nft_pipapo_elem *e; if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) continue; e = f->mt[r].e; nf_tables_set_elem_destroy(ctx, set, &e->priv); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements * @ctx: context * @set: nftables API set representation */ static void nft_pipapo_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; m = rcu_dereference_protected(priv->match, true); if (priv->clone) { nft_set_pipapo_match_destroy(ctx, set, priv->clone); pipapo_free_match(priv->clone); priv->clone = NULL; } else { nft_set_pipapo_match_destroy(ctx, set, m); } pipapo_free_match(m); } /** * nft_pipapo_gc_init() - Initialise garbage collection * @set: nftables API set representation * * Instead of actually setting up a periodic work for garbage collection, as * this operation requires a swap of matching data with the working copy, we'll * do that opportunistically with other commit operations if the interval is * elapsed, so we just need to set the current jiffies timestamp here. */ static void nft_pipapo_gc_init(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); priv->last_gc = jiffies; } const struct nft_set_type nft_set_pipapo_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .lookup = nft_pipapo_lookup, .insert = nft_pipapo_insert, .activate = nft_pipapo_activate, .deactivate = nft_pipapo_deactivate, .flush = nft_pipapo_flush, .remove = nft_pipapo_remove, .walk = nft_pipapo_walk, .get = nft_pipapo_get, .privsize = nft_pipapo_privsize, .estimate = nft_pipapo_estimate, .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) const struct nft_set_type nft_set_pipapo_avx2_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .lookup = nft_pipapo_avx2_lookup, .insert = nft_pipapo_insert, .activate = nft_pipapo_activate, .deactivate = nft_pipapo_deactivate, .flush = nft_pipapo_flush, .remove = nft_pipapo_remove, .walk = nft_pipapo_walk, .get = nft_pipapo_get, .privsize = nft_pipapo_privsize, .estimate = nft_pipapo_avx2_estimate, .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; #endif
2362 26 2087 2079 2089 2082 9 9 9 6 3 5 4 9 9 9 9 9 1115 1115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { kfree(sb->s_fsnotify_info); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. */ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks) && (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; if (sb) marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); if (mnt_data) marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } if (mnt_data) { iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm * hook and set the FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return 0; } /* * OK, there are some permission event watchers. Check if anybody is * watching for permission events on *this* file. */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, ALL_FSNOTIFY_PERM_EVENTS); if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask |= fsnotify_inode_watches_children(d_inode(parent)); dput(parent); } /* * Legacy FAN_ACCESS_PERM events have very high performance overhead, * so unlikely to be used in the wild. If they are used there will be * no optimizations at all. */ if (unlikely(p_mask & FS_ACCESS_PERM)) { /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); goto open_perm; } /* * Pre-content events are only supported on regular files. * If there are pre-content event watchers and no permission access * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. * That is the common case with HSM service. */ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); goto open_perm; } /* Nobody watching permission and pre-content events on this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); open_perm: /* * Send open perm events depending on object masks and regardless of * FMODE_NONOTIFY_PERM. */ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } if (p_mask & FS_OPEN_PERM) return fsnotify_path(&file->f_path, FS_OPEN_PERM); return 0; } #endif void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) { struct fsnotify_mnt data = { .ns = ns, .mnt_id = real_mount(mnt)->mnt_id_unique, }; if (WARN_ON_ONCE(!ns)) return; /* * This is an optimization as well as making sure fsnotify_init() has * been called. */ if (!ns->n_fsnotify_marks) return; fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); } static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); return 0; } core_initcall(fsnotify_init);
34 35 3 3 3 3 3 3 1 2 2 2 1 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 2 2 2 4 3 1 1 16 8 2 1 4 1 2 2 1 2 2 8 1 4 2 3 1 5 3 1 1 1 1 4 3 1 1 2 2 2 2 15 15 3 2 4 4 2 2 18 10 2 7 4 3 5 1 3 3 4 4 3 2 2 3 2 4 3 2 3 2 2 2 1 15 40 8 2 2 7 15 28 1 10 7 3 3 16 3 2 3 5 5 8 2 1 1 8 2 1 1 1 3 1 1 13 6 4 1 2 7 2 5 1 1 1 1 1 18 9 1 10 8 7 2 16 7 2 2 1 3 3 1 4 2 1 2 2 7 1 1 2 2 3 1 15 1 4 2 7 1 9 3 2 2 2 8 5 2 3 1 1 1 10 8 2 9 8 1 1 10 10 13 13 13 10 10 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 /* * Copyright (c) 2017 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" #include "cma_priv.h" #include "restrack.h" #include "uverbs.h" /* * This determines whether a non-privileged user is allowed to specify a * controlled QKEY or not, when true non-privileged user is allowed to specify * a controlled QKEY. */ static bool privileged_qkey; typedef int (*res_fill_func_t)(struct sk_buff*, bool, struct rdma_restrack_entry*, uint32_t); /* * Sort array elements by the netlink attribute name */ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type) { if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name)) return -EMSGSIZE; if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC && nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type)) return -EMSGSIZE; return 0; } static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type, u32 value) { if (put_driver_name_print_type(msg, name, print_type)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value)) return -EMSGSIZE; return 0; } static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, enum rdma_nldev_print_type print_type, u64 value) { if (put_driver_name_print_type(msg, name, print_type)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; return 0; } int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name, const char *str) { if (put_driver_name_print_type(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str)) return -EMSGSIZE; return 0; } EXPORT_SYMBOL(rdma_nl_put_driver_string); int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u32); int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex); int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value) { return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u64); int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value) { return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, value); } EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); bool rdma_nl_get_privileged_qkey(void) { return privileged_qkey; } EXPORT_SYMBOL(rdma_nl_get_privileged_qkey); static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, dev_name(&device->dev))) return -EMSGSIZE; return 0; } static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { char fw[IB_FW_VERSION_NAME_MAX]; int ret = 0; u32 port; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, device->attrs.device_cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; ib_get_device_fw_str(device, fw); /* Device without FW has strlen(fw) = 0 */ if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, be64_to_cpu(device->node_guid), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, be64_to_cpu(device->attrs.sys_image_guid), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) return -EMSGSIZE; if (device->type && nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type)) return -EMSGSIZE; if (device->parent && nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME, dev_name(&device->parent->dev))) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, device->name_assign_type)) return -EMSGSIZE; /* * Link type is determined on first port and mlx4 device * which can potentially have two different link type for the same * IB device is considered as better to be avoided in the future, */ port = rdma_start_port(device); if (rdma_cap_opa_mad(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); else if (rdma_protocol_ib(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); else if (rdma_protocol_iwarp(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); else if (rdma_protocol_roce(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); else if (rdma_protocol_usnic(device, port)) ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "usnic"); return ret; } static int fill_port_info(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = NULL; struct ib_port_attr attr; int ret; u64 cap_flags = 0; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; ret = ib_query_port(device, port, &attr); if (ret) return ret; if (rdma_protocol_ib(device, port)) { BUILD_BUG_ON((sizeof(attr.port_cap_flags) + sizeof(attr.port_cap_flags2)) > sizeof(u64)); cap_flags = attr.port_cap_flags | ((u64)attr.port_cap_flags2 << 32); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) return -EMSGSIZE; } if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); } out: dev_put(netdev); return ret; } static int fill_res_info_entry(struct sk_buff *msg, const char *name, u64 curr) { struct nlattr *entry_attr; entry_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_res_info(struct sk_buff *msg, struct ib_device *device, bool show_details) { static const char * const names[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_PD] = "pd", [RDMA_RESTRACK_CQ] = "cq", [RDMA_RESTRACK_QP] = "qp", [RDMA_RESTRACK_CM_ID] = "cm_id", [RDMA_RESTRACK_MR] = "mr", [RDMA_RESTRACK_CTX] = "ctx", [RDMA_RESTRACK_SRQ] = "srq", }; struct nlattr *table_attr; int ret, i, curr; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); if (!table_attr) return -EMSGSIZE; for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; curr = rdma_restrack_count(device, i, show_details); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; } nla_nest_end(msg, table_attr); return 0; err: nla_nest_cancel(msg, table_attr); return ret; } static int fill_res_name_pid(struct sk_buff *msg, struct rdma_restrack_entry *res) { int err = 0; /* * For user resources, user is should read /proc/PID/comm to get the * name of the task file. */ if (rdma_is_kernel_res(res)) { err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name); } else { pid_t pid; pid = task_pid_vnr(res->task); /* * Task is dead and in zombie state. * There is no need to print PID anymore. */ if (pid) /* * This part is racy, task can be killed and PID will * be zero right here but it is ok, next query won't * return PID. We don't promise real-time reflection * of SW objects. */ err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid); } return err ? -EMSGSIZE : 0; } static int fill_res_qp_entry_query(struct sk_buff *msg, struct rdma_restrack_entry *res, struct ib_device *dev, struct ib_qp *qp) { struct ib_qp_init_attr qp_init_attr; struct ib_qp_attr qp_attr; int ret; ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); if (ret) return ret; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, qp_attr.rq_psn)) goto err; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, qp_attr.path_mig_state)) goto err; } if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; if (dev->ops.fill_res_qp_entry) return dev->ops.fill_res_qp_entry(msg, qp); return 0; err: return -EMSGSIZE; } static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_device *dev = qp->device; int ret; if (port && port != qp->port) return -EAGAIN; /* In create_qp() port is not set yet */ if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) return -EMSGSIZE; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); if (ret) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) return -EMSGSIZE; ret = fill_res_name_pid(msg, res); if (ret) return -EMSGSIZE; return fill_res_qp_entry_query(msg, res, dev, qp); } static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); struct ib_device *dev = qp->device; if (port && port != qp->port) return -EAGAIN; if (!dev->ops.fill_res_qp_entry_raw) return -EINVAL; return dev->ops.fill_res_qp_entry_raw(msg, qp); } static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; if (port && port != cm_id->port_num) return -EAGAIN; if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) goto err; if (id_priv->qp_num) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) goto err; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) goto err; if (cm_id->route.addr.src_addr.ss_family && nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR, sizeof(cm_id->route.addr.src_addr), &cm_id->route.addr.src_addr)) goto err; if (cm_id->route.addr.dst_addr.ss_family && nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR, sizeof(cm_id->route.addr.dst_addr), &cm_id->route.addr.dst_addr)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) goto err; if (fill_res_name_pid(msg, res)) goto err; if (dev->ops.fill_res_cm_id_entry) return dev->ops.fill_res_cm_id_entry(msg, cm_id); return 0; err: return -EMSGSIZE; } static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, cq->uobject->uevent.uobject.context->res.id)) return -EMSGSIZE; if (fill_res_name_pid(msg, res)) return -EMSGSIZE; return (dev->ops.fill_res_cq_entry) ? dev->ops.fill_res_cq_entry(msg, cq) : 0; } static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); struct ib_device *dev = cq->device; if (!dev->ops.fill_res_cq_entry_raw) return -EINVAL; return dev->ops.fill_res_cq_entry_raw(msg, cq); } static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) return -EMSGSIZE; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) return -EMSGSIZE; if (fill_res_name_pid(msg, res)) return -EMSGSIZE; return (dev->ops.fill_res_mr_entry) ? dev->ops.fill_res_mr_entry(msg, mr) : 0; } static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (!dev->ops.fill_res_mr_entry_raw) return -EINVAL; return dev->ops.fill_res_mr_entry_raw(msg, mr); } static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, pd->local_dma_lkey)) goto err; if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, pd->unsafe_global_rkey)) goto err; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) goto err; if (!rdma_is_kernel_res(res) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, pd->uobject->context->res.id)) goto err; return fill_res_name_pid(msg, res); err: return -EMSGSIZE; } static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res); if (rdma_is_kernel_res(res)) return 0; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id)) return -EMSGSIZE; return fill_res_name_pid(msg, res); } static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range, uint32_t max_range) { struct nlattr *entry_attr; if (!min_range) return 0; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); if (!entry_attr) return -EMSGSIZE; if (min_range == max_range) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range)) goto err; } else { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range)) goto err; } nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq) { uint32_t min_range = 0, prev = 0; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct nlattr *table_attr; struct ib_qp *qp = NULL; unsigned long id = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); if (!table_attr) return -EMSGSIZE; rt = &srq->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; qp = container_of(res, struct ib_qp, res); if (!qp->srq || (qp->srq->res.id != srq->res.id)) { rdma_restrack_put(res); continue; } if (qp->qp_num < prev) /* qp_num should be ascending */ goto err_loop; if (min_range == 0) { min_range = qp->qp_num; } else if (qp->qp_num > (prev + 1)) { if (fill_res_range_qp_entry(msg, min_range, prev)) goto err_loop; min_range = qp->qp_num; } prev = qp->qp_num; rdma_restrack_put(res); } xa_unlock(&rt->xa); if (fill_res_range_qp_entry(msg, min_range, prev)) goto err; nla_nest_end(msg, table_attr); return 0; err_loop: rdma_restrack_put(res); xa_unlock(&rt->xa); err: nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_srq *srq = container_of(res, struct ib_srq, res); struct ib_device *dev = srq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id)) goto err; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id)) goto err; if (ib_srq_has_cq(srq->srq_type)) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, srq->ext.cq->res.id)) goto err; } if (fill_res_srq_qps(msg, srq)) goto err; if (fill_res_name_pid(msg, res)) goto err; if (dev->ops.fill_res_srq_entry) return dev->ops.fill_res_srq_entry(msg, srq); return 0; err: return -EMSGSIZE; } static int fill_res_srq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_srq *srq = container_of(res, struct ib_srq, res); struct ib_device *dev = srq->device; if (!dev->ops.fill_res_srq_entry_raw) return -EINVAL; return dev->ops.fill_res_srq_entry_raw(msg, srq); } static int fill_stat_counter_mode(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_counter_mode *m = &counter->mode; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) return -EMSGSIZE; if (m->mode == RDMA_COUNTER_MODE_AUTO) { if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) return -EMSGSIZE; if ((m->mask & RDMA_COUNTER_MASK_PID) && fill_res_name_pid(msg, &counter->res)) return -EMSGSIZE; } return 0; } static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) { struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } static int fill_stat_counter_qps(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct nlattr *table_attr; struct ib_qp *qp = NULL; unsigned long id = 0; int ret = 0; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); if (!table_attr) return -EMSGSIZE; rt = &counter->device->res[RDMA_RESTRACK_QP]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { qp = container_of(res, struct ib_qp, res); if (!qp->counter || (qp->counter->id != counter->id)) continue; ret = fill_stat_counter_qp_entry(msg, qp->qp_num); if (ret) goto err; } xa_unlock(&rt->xa); nla_nest_end(msg, table_attr); return 0; err: xa_unlock(&rt->xa); nla_nest_cancel(msg, table_attr); return ret; } int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name, u64 value) { struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); if (!entry_attr) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, name)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, value, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); return 0; err: nla_nest_cancel(msg, entry_attr); return -EMSGSIZE; } EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry); static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); struct ib_device *dev = mr->pd->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) goto err; if (dev->ops.fill_stat_mr_entry) return dev->ops.fill_stat_mr_entry(msg, mr); return 0; err: return -EMSGSIZE; } static int fill_stat_counter_hwcounters(struct sk_buff *msg, struct rdma_counter *counter) { struct rdma_hw_stats *st = counter->stats; struct nlattr *table_attr; int i; table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table_attr) return -EMSGSIZE; mutex_lock(&st->lock); for (i = 0; i < st->num_counters; i++) { if (test_bit(i, st->is_disabled)) continue; if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name, st->value[i])) goto err; } mutex_unlock(&st->lock); nla_nest_end(msg, table_attr); return 0; err: mutex_unlock(&st->lock); nla_nest_cancel(msg, table_attr); return -EMSGSIZE; } static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_counter *counter = container_of(res, struct rdma_counter, res); if (port && port != counter->port) return -EAGAIN; /* Dump it even query failed */ rdma_counter_query_stats(counter); if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || fill_stat_counter_mode(msg, counter) || fill_stat_counter_qps(msg, counter) || fill_stat_counter_hwcounters(msg, counter)) return -EMSGSIZE; return 0; } static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); if (!nlh) { err = -EMSGSIZE; goto err_free; } err = fill_dev_info(msg, device); if (err) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return err; } static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 index; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); if (strlen(name) == 0) { err = -EINVAL; goto done; } err = ib_device_rename(device, name); goto done; } if (tb[RDMA_NLDEV_NET_NS_FD]) { u32 ns_fd; ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); err = ib_device_set_netns_put(skb, device, ns_fd); goto put_done; } if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { u8 use_dim; use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); err = ib_device_set_dim(device, use_dim); goto done; } done: ib_device_put(device); put_done: return err; } static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx) { int start = cb->args[0]; struct nlmsghdr *nlh; if (idx < start) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, NLM_F_MULTI); if (!nlh || fill_dev_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); idx++; out: cb->args[0] = idx; return skb->len; } static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { /* * There is no need to take lock, because * we are relying on ib_core's locking. */ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); } static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index; u32 port; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { err = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); if (!nlh) { err = -EMSGSIZE; goto err_free; } err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return err; } static int nldev_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; int start = cb->args[0]; struct nlmsghdr *nlh; u32 idx = 0; u32 ifindex; int err; unsigned int p; err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), ifindex); if (!device) return -EINVAL; rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. This specific index is taken from the netlink * messages request sent by user and it is available * in cb->args[0]. * * Usually, the user doesn't fill this field and it causes * to return everything. * */ if (idx < start) { idx++; continue; } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } idx++; nlmsg_end(skb, nlh); } out: ib_device_put(device); cb->args[0] = idx; return skb->len; } static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; bool show_details = false; struct ib_device *device; struct sk_buff *msg; u32 index; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_free; } ret = fill_res_info(msg, device, show_details); if (ret) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int _nldev_res_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx) { int start = cb->args[0]; struct nlmsghdr *nlh; if (idx < start) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); if (!nlh || fill_res_info(skb, device, false)) { nlmsg_cancel(skb, nlh); goto out; } nlmsg_end(skb, nlh); idx++; out: cb->args[0] = idx; return skb->len; } static int nldev_res_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); } struct nldev_fill_res_entry { enum rdma_nldev_attr nldev_attr; u8 flags; u32 entry; u32 id; }; enum nldev_res_flags { NLDEV_PER_DEV = 1 << 0, }; static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, [RDMA_RESTRACK_COUNTER] = { .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, }, [RDMA_RESTRACK_CTX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY, .id = RDMA_NLDEV_ATTR_RES_CTXN, }, [RDMA_RESTRACK_SRQ] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, .id = RDMA_NLDEV_ATTR_RES_SRQN, }, }; static noinline_for_stack int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, enum rdma_restrack_type res_type, res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; struct ib_device *device; u32 index, id, port = 0; bool has_cap_net_admin; struct sk_buff *msg; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } } if ((port && fe->flags & NLDEV_PER_DEV) || (!port && ~fe->flags & NLDEV_PER_DEV)) { ret = -EINVAL; goto err; } id = nla_get_u32(tb[fe->id]); res = rdma_restrack_get_byid(device, res_type, id); if (IS_ERR(res)) { ret = PTR_ERR(res); goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_get; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NL_GET_OP(nlh->nlmsg_type)), 0, 0); if (!nlh || fill_nldev_handle(msg, device)) { ret = -EMSGSIZE; goto err_free; } has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); ret = fill_func(msg, has_cap_net_admin, res, port); if (ret) goto err_free; rdma_restrack_put(res); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); err_get: rdma_restrack_put(res); err: ib_device_put(device); return ret; } static int res_get_common_dumpit(struct sk_buff *skb, struct netlink_callback *cb, enum rdma_restrack_type res_type, res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; bool show_details = false; struct nlattr *table_attr; struct nlattr *entry_attr; struct ib_device *device; int start = cb->args[0]; bool has_cap_net_admin; struct nlmsghdr *nlh; unsigned long id; u32 index, port = 0; bool filled = false; err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. * if it doesn't exist, we will iterate over all devices. * * But it is not needed for now. */ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); /* * If no PORT_INDEX is supplied, we will return all QPs from that device */ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err_index; } } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); if (!nlh || fill_nldev_handle(skb, device)) { ret = -EMSGSIZE; goto err; } table_attr = nla_nest_start_noflag(skb, fe->nldev_attr); if (!table_attr) { ret = -EMSGSIZE; goto err; } has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); rt = &device->res[res_type]; xa_lock(&rt->xa); /* * FIXME: if the skip ahead is something common this loop should * use xas_for_each & xas_pause to optimize, we can have a lot of * objects. */ xa_for_each(&rt->xa, id, res) { if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details) goto next; if (idx < start || !rdma_restrack_get(res)) goto next; xa_unlock(&rt->xa); filled = true; entry_attr = nla_nest_start_noflag(skb, fe->entry); if (!entry_attr) { ret = -EMSGSIZE; rdma_restrack_put(res); goto msg_full; } ret = fill_func(skb, has_cap_net_admin, res, port); rdma_restrack_put(res); if (ret) { nla_nest_cancel(skb, entry_attr); if (ret == -EMSGSIZE) goto msg_full; if (ret == -EAGAIN) goto again; goto res_err; } nla_nest_end(skb, entry_attr); again: xa_lock(&rt->xa); next: idx++; } xa_unlock(&rt->xa); msg_full: nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); cb->args[0] = idx; /* * No more entries to fill, cancel the message and * return 0 to mark end of dumpit. */ if (!filled) goto err; ib_device_put(device); return skb->len; res_err: nla_nest_cancel(skb, table_attr); err: nlmsg_cancel(skb, nlh); err_index: ib_device_put(device); return ret; } #define RES_GET_FUNCS(name, type) \ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ struct netlink_callback *cb) \ { \ return res_get_common_dumpit(skb, cb, type, \ fill_res_##name##_entry); \ } \ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ struct nlmsghdr *nlh, \ struct netlink_ext_ack *extack) \ { \ return res_get_common_doit(skb, nlh, extack, type, \ fill_res_##name##_entry); \ } RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX); RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ); RES_GET_FUNCS(srq_raw, RDMA_RESTRACK_SRQ); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); static const struct rdma_link_ops *link_ops_get(const char *type) { const struct rdma_link_ops *ops; list_for_each_entry(ops, &link_ops, list) { if (!strcmp(ops->type, type)) goto out; } ops = NULL; out: return ops; } void rdma_link_register(struct rdma_link_ops *ops) { down_write(&link_ops_rwsem); if (WARN_ON_ONCE(link_ops_get(ops->type))) goto out; list_add(&ops->list, &link_ops); out: up_write(&link_ops_rwsem); } EXPORT_SYMBOL(rdma_link_register); void rdma_link_unregister(struct rdma_link_ops *ops) { down_write(&link_ops_rwsem); list_del(&ops->list); up_write(&link_ops_rwsem); } EXPORT_SYMBOL(rdma_link_unregister); static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; char ibdev_name[IB_DEVICE_NAME_MAX]; const struct rdma_link_ops *ops; char ndev_name[IFNAMSIZ]; struct net_device *ndev; char type[IFNAMSIZ]; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(ibdev_name)); if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) return -EINVAL; nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); if (!ndev) return -ENODEV; down_read(&link_ops_rwsem); ops = link_ops_get(type); #ifdef CONFIG_MODULES if (!ops) { up_read(&link_ops_rwsem); request_module("rdma-link-%s", type); down_read(&link_ops_rwsem); ops = link_ops_get(type); } #endif err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; up_read(&link_ops_rwsem); dev_put(ndev); return err; } static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 index; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) { ib_device_put(device); return -EINVAL; } ib_unregister_device_and_put(device); return 0; } static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; struct ib_client_nl_info data = {}; struct ib_device *ibdev = NULL; struct sk_buff *msg; u32 index; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); ibdev = ib_device_get_by_index(sock_net(skb->sk), index); if (!ibdev) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(ibdev, data.port)) { err = -EINVAL; goto out_put; } } else { data.port = -1; } } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out_put; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET_CHARDEV), 0, 0); if (!nlh) { err = -EMSGSIZE; goto out_nlmsg; } data.nl_msg = msg; err = ib_get_client_nl_info(ibdev, client_name, &data); if (err) goto out_nlmsg; err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, huge_encode_dev(data.cdev->devt), RDMA_NLDEV_ATTR_PAD); if (err) goto out_data; err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, RDMA_NLDEV_ATTR_PAD); if (err) goto out_data; if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, dev_name(data.cdev))) { err = -EMSGSIZE; goto out_data; } nlmsg_end(msg, nlh); put_device(data.cdev); if (ibdev) ib_device_put(ibdev); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); out_data: put_device(data.cdev); out_nlmsg: nlmsg_free(msg); out_put: if (ibdev) ib_device_put(ibdev); return err; } static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct sk_buff *msg; int err; err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SYS_GET), 0, 0); if (!nlh) { nlmsg_free(msg); return -EMSGSIZE; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, (u8)ib_devices_shared_netns); if (err) { nlmsg_free(msg); return err; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE, (u8)privileged_qkey); if (err) { nlmsg_free(msg); return err; } err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); if (err) { nlmsg_free(msg); return err; } /* * Copy-on-fork is supported. * See commits: * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") * for more details. Don't backport this without them. * * Return value ignored on purpose, assume copy-on-fork is not * supported in case of failure. */ nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1); nlmsg_end(msg, nlh); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); } static int nldev_set_sys_set_netns_doit(struct nlattr *tb[]) { u8 enable; int err; enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); /* Only 0 and 1 are supported */ if (enable > 1) return -EINVAL; err = rdma_compatdev_set(enable); return err; } static int nldev_set_sys_set_pqkey_doit(struct nlattr *tb[]) { u8 enable; enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]); /* Only 0 and 1 are supported */ if (enable > 1) return -EINVAL; privileged_qkey = enable; return 0; } static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int err; err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (err) return -EINVAL; if (tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) return nldev_set_sys_set_netns_doit(tb); if (tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]) return nldev_set_sys_set_pqkey_doit(tb); return -EINVAL; } static int nldev_stat_set_mode_doit(struct sk_buff *msg, struct netlink_ext_ack *extack, struct nlattr *tb[], struct ib_device *device, u32 port) { u32 mode, mask = 0, qpn, cntn = 0; bool opcnt = false; int ret; /* Currently only counter for QP is supported */ if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]) opcnt = !!nla_get_u8( tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]); mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); if (mode == RDMA_COUNTER_MODE_AUTO) { if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) mask = nla_get_u32( tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); return rdma_counter_set_auto_mode(device, port, mask, opcnt, extack); } if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) return -EINVAL; qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); ret = rdma_counter_bind_qpn(device, port, qpn, cntn); if (ret) return ret; } else { ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn); if (ret) return ret; } if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; goto err_fill; } return 0; err_fill: rdma_counter_unbind_qpn(device, port, qpn, cntn); return ret; } static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], struct ib_device *device, u32 port) { struct rdma_hw_stats *stats; struct nlattr *entry_attr; unsigned long *target; int rem, i, ret = 0; u32 index; stats = ib_get_hw_stats_port(device, port); if (!stats) return -EINVAL; target = kcalloc(BITS_TO_LONGS(stats->num_counters), sizeof(*stats->is_disabled), GFP_KERNEL); if (!target) return -ENOMEM; nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], rem) { index = nla_get_u32(entry_attr); if ((index >= stats->num_counters) || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) { ret = -EINVAL; goto out; } set_bit(index, target); } for (i = 0; i < stats->num_counters; i++) { if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)) continue; ret = rdma_counter_modify(device, port, i, test_bit(i, target)); if (ret) goto out; } out: kfree(target); return ret; } static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index, port; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err_put_device; } if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] && !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { ret = -EINVAL; goto err_put_device; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_put_device; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_free_msg; } if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); if (ret) goto err_free_msg; } if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { ret = nldev_stat_set_counter_dynamic_doit(tb, device, port); if (ret) goto err_free_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free_msg: nlmsg_free(msg); err_put_device: ib_device_put(device); return ret; } static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; struct sk_buff *msg; u32 index, port, qpn, cntn; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || !tb[RDMA_NLDEV_ATTR_RES_LQPN]) return -EINVAL; if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_SET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_fill; } cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { ret = -EMSGSIZE; goto err_fill; } ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); if (ret) goto err_fill; nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_fill: nlmsg_free(msg); err: ib_device_put(device); return ret; } static noinline_for_stack int stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) { struct rdma_hw_stats *stats; struct nlattr *table_attr; struct ib_device *device; int ret, num_cnts, i; struct sk_buff *msg; u32 index, port; u64 v; if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) { ret = -EINVAL; goto err; } port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); stats = ib_get_hw_stats_port(device, port); if (!stats) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { ret = -EMSGSIZE; goto err_msg; } mutex_lock(&stats->lock); num_cnts = device->ops.get_hw_stats(device, stats, port, 0); if (num_cnts < 0) { ret = -EINVAL; goto err_stats; } table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table_attr) { ret = -EMSGSIZE; goto err_stats; } for (i = 0; i < num_cnts; i++) { if (test_bit(i, stats->is_disabled)) continue; v = stats->value[i] + rdma_counter_get_hwstat_value(device, port, i); if (rdma_nl_stat_hwcounter_entry(msg, stats->descs[i].name, v)) { ret = -EMSGSIZE; goto err_table; } } nla_nest_end(msg, table_attr); mutex_unlock(&stats->lock); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_table: nla_nest_cancel(msg, table_attr); err_stats: mutex_unlock(&stats->lock); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static noinline_for_stack int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr *tb[]) { static enum rdma_nl_counter_mode mode; static enum rdma_nl_counter_mask mask; struct ib_device *device; struct sk_buff *msg; u32 index, port; bool opcnt; int ret; if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) return nldev_res_get_counter_doit(skb, nlh, extack); if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET), 0, 0); if (!nlh) { ret = -EMSGSIZE; goto err_msg; } ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt); if (ret) goto err_msg; if (fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) { ret = -EMSGSIZE; goto err_msg; } if ((mode == RDMA_COUNTER_MODE_AUTO) && nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { ret = -EMSGSIZE; goto err_msg; } if ((mode == RDMA_COUNTER_MODE_AUTO) && nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) { ret = -EMSGSIZE; goto err_msg; } nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret) return -EINVAL; if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) return stat_get_doit_default_counter(skb, nlh, extack, tb); switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = stat_get_doit_qp(skb, nlh, extack, tb); break; case RDMA_NLDEV_ATTR_RES_MR: ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR, fill_stat_mr_entry); break; default: ret = -EINVAL; break; } return ret; } static int nldev_stat_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) return -EINVAL; switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { case RDMA_NLDEV_ATTR_RES_QP: ret = nldev_res_get_counter_dumpit(skb, cb); break; case RDMA_NLDEV_ATTR_RES_MR: ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR, fill_stat_mr_entry); break; default: ret = -EINVAL; break; } return ret; } static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; struct rdma_hw_stats *stats; struct ib_device *device; struct sk_buff *msg; u32 devid, port; int ret, i; ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), devid); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); if (!rdma_is_port_valid(device, port)) { ret = -EINVAL; goto err; } stats = ib_get_hw_stats_port(device, port); if (!stats) { ret = -EINVAL; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } nlh = nlmsg_put( msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), 0, 0); ret = -EMSGSIZE; if (!nlh || fill_nldev_handle(msg, device) || nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) goto err_msg; table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); if (!table) goto err_msg; mutex_lock(&stats->lock); for (i = 0; i < stats->num_counters; i++) { entry = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); if (!entry) goto err_msg_table; if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, stats->descs[i].name) || nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) goto err_msg_entry; if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, !test_bit(i, stats->is_disabled)))) goto err_msg_entry; nla_nest_end(msg, entry); } mutex_unlock(&stats->lock); nla_nest_end(msg, table); nlmsg_end(msg, nlh); ib_device_put(device); return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_msg_entry: nla_nest_cancel(msg, entry); err_msg_table: mutex_unlock(&stats->lock); nla_nest_cancel(msg, table); err_msg: nlmsg_free(msg); err: ib_device_put(device); return ret; } static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; enum rdma_nl_dev_type type; struct ib_device *parent; char name[IFNAMSIZ] = {}; u32 parentid; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE]) return -EINVAL; nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name)); type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]); parentid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); parent = ib_device_get_by_index(sock_net(skb->sk), parentid); if (!parent) return -EINVAL; ret = ib_add_sub_device(parent, type, name); ib_device_put(parent); return ret; } static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct ib_device *device; u32 devid; int ret; ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); device = ib_device_get_by_index(sock_net(skb->sk), devid); if (!device) return -EINVAL; return ib_del_sub_device_and_put(device); } static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, [RDMA_NLDEV_CMD_GET_CHARDEV] = { .doit = nldev_get_chardev, }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_NEWLINK] = { .doit = nldev_newlink, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_DELLINK] = { .doit = nldev_dellink, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, }, [RDMA_NLDEV_CMD_RES_GET] = { .doit = nldev_res_get_doit, .dump = nldev_res_get_dumpit, }, [RDMA_NLDEV_CMD_RES_QP_GET] = { .doit = nldev_res_get_qp_doit, .dump = nldev_res_get_qp_dumpit, }, [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { .doit = nldev_res_get_cm_id_doit, .dump = nldev_res_get_cm_id_dumpit, }, [RDMA_NLDEV_CMD_RES_CQ_GET] = { .doit = nldev_res_get_cq_doit, .dump = nldev_res_get_cq_dumpit, }, [RDMA_NLDEV_CMD_RES_MR_GET] = { .doit = nldev_res_get_mr_doit, .dump = nldev_res_get_mr_dumpit, }, [RDMA_NLDEV_CMD_RES_PD_GET] = { .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, [RDMA_NLDEV_CMD_RES_CTX_GET] = { .doit = nldev_res_get_ctx_doit, .dump = nldev_res_get_ctx_dumpit, }, [RDMA_NLDEV_CMD_RES_SRQ_GET] = { .doit = nldev_res_get_srq_doit, .dump = nldev_res_get_srq_dumpit, }, [RDMA_NLDEV_CMD_SYS_GET] = { .doit = nldev_sys_get_doit, }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_SET] = { .doit = nldev_stat_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_GET] = { .doit = nldev_stat_get_doit, .dump = nldev_stat_get_dumpit, }, [RDMA_NLDEV_CMD_STAT_DEL] = { .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { .doit = nldev_res_get_qp_raw_doit, .dump = nldev_res_get_qp_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { .doit = nldev_res_get_cq_raw_doit, .dump = nldev_res_get_cq_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { .doit = nldev_res_get_mr_raw_doit, .dump = nldev_res_get_mr_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_RES_SRQ_GET_RAW] = { .doit = nldev_res_get_srq_raw_doit, .dump = nldev_res_get_srq_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { .doit = nldev_stat_get_counter_status_doit, }, [RDMA_NLDEV_CMD_NEWDEV] = { .doit = nldev_newdev, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_DELDEV] = { .doit = nldev_deldev, .flags = RDMA_NL_ADMIN_PERM, }, }; static int fill_mon_netdev_rename(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = ib_device_get_netdev(device, port); int ret = 0; if (!netdev || !net_eq(dev_net(netdev), net)) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); out: dev_put(netdev); return ret; } static int fill_mon_netdev_association(struct sk_buff *msg, struct ib_device *device, u32 port, const struct net *net) { struct net_device *netdev = ib_device_get_netdev(device, port); int ret = 0; if (netdev && !net_eq(dev_net(netdev), net)) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, dev_name(&device->dev)); if (ret) goto out; ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); if (ret) goto out; if (netdev) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); if (ret) goto out; ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); } out: dev_put(netdev); return ret; } static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct net_device *netdev; switch (type) { case RDMA_REGISTER_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor register device event\n"); break; case RDMA_UNREGISTER_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor unregister device event\n"); break; case RDMA_NETDEV_ATTACH_EVENT: netdev = ib_device_get_netdev(device, port_num); dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", port_num, netdev->ifindex); dev_put(netdev); break; case RDMA_NETDEV_DETACH_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev detach event: port %d\n", port_num); break; case RDMA_RENAME_EVENT: dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor rename device event\n"); break; case RDMA_NETDEV_RENAME_EVENT: netdev = ib_device_get_netdev(device, port_num); dev_warn_ratelimited(&device->dev, "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n", port_num, netdev->ifindex); dev_put(netdev); break; default: break; } } int rdma_nl_notify_event(struct ib_device *device, u32 port_num, enum rdma_nl_notify_event_type type) { struct sk_buff *skb; int ret = -EMSGSIZE; struct net *net; void *nlh; net = read_pnet(&device->coredev.rdma_net); if (!net) return -EINVAL; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), 0, 0); if (!nlh) goto err_free; switch (type) { case RDMA_REGISTER_EVENT: case RDMA_UNREGISTER_EVENT: case RDMA_RENAME_EVENT: ret = fill_nldev_handle(skb, device); if (ret) goto err_free; break; case RDMA_NETDEV_ATTACH_EVENT: case RDMA_NETDEV_DETACH_EVENT: ret = fill_mon_netdev_association(skb, device, port_num, net); if (ret) goto err_free; break; case RDMA_NETDEV_RENAME_EVENT: ret = fill_mon_netdev_rename(skb, device, port_num, net); if (ret) goto err_free; break; default: break; } ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); if (ret) goto err_free; nlmsg_end(skb, nlh); ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); if (ret && ret != -ESRCH) { skb = NULL; /* skb is freed in the netlink send-op handling */ goto err_free; } return 0; err_free: rdma_nl_notify_err_msg(device, port_num, type); nlmsg_free(skb); return ret; } void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); } void nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5);
209 496 1107 6 359 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } static inline bool timespec64_is_epoch(const struct timespec64 *ts) { return ts->tv_sec == 0 && ts->tv_nsec == 0; } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */
34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 // SPDX-License-Identifier: GPL-2.0-only /* * cfg80211 debugfs * * Copyright 2009 Luis R. Rodriguez <lrodriguez@atheros.com> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2023 Intel Corporation */ #include <linux/slab.h> #include "core.h" #include "debugfs.h" #define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...) \ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct wiphy *wiphy = file->private_data; \ char buf[buflen]; \ int res; \ \ res = scnprintf(buf, buflen, fmt "\n", ##value); \ return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ } \ \ static const struct file_operations name## _ops = { \ .read = name## _read, \ .open = simple_open, \ .llseek = generic_file_llseek, \ } DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", wiphy->rts_threshold); DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", wiphy->frag_threshold); DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", wiphy->retry_short); DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", wiphy->retry_long); static int ht_print_chan(struct ieee80211_channel *chan, char *buf, int buf_size, int offset) { if (WARN_ON(offset > buf_size)) return 0; if (chan->flags & IEEE80211_CHAN_DISABLED) return scnprintf(buf + offset, buf_size - offset, "%d Disabled\n", chan->center_freq); return scnprintf(buf + offset, buf_size - offset, "%d HT40 %c%c\n", chan->center_freq, (chan->flags & IEEE80211_CHAN_NO_HT40MINUS) ? ' ' : '-', (chan->flags & IEEE80211_CHAN_NO_HT40PLUS) ? ' ' : '+'); } static ssize_t ht40allow_map_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct wiphy *wiphy = file->private_data; char *buf; unsigned int offset = 0, buf_size = PAGE_SIZE, i; enum nl80211_band band; struct ieee80211_supported_band *sband; ssize_t r; buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) return -ENOMEM; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) offset += ht_print_chan(&sband->channels[i], buf, buf_size, offset); } r = simple_read_from_buffer(user_buf, count, ppos, buf, offset); kfree(buf); return r; } static const struct file_operations ht40allow_map_ops = { .read = ht40allow_map_read, .open = simple_open, .llseek = default_llseek, }; #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops) void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) { struct dentry *phyd = rdev->wiphy.debugfsdir; DEBUGFS_ADD(rts_threshold); DEBUGFS_ADD(fragmentation_threshold); DEBUGFS_ADD(short_retry_limit); DEBUGFS_ADD(long_retry_limit); DEBUGFS_ADD(ht40allow_map); } struct debugfs_read_work { struct wiphy_work work; ssize_t (*handler)(struct wiphy *wiphy, struct file *file, char *buf, size_t count, void *data); struct wiphy *wiphy; struct file *file; char *buf; size_t bufsize; void *data; ssize_t ret; struct completion completion; }; static void wiphy_locked_debugfs_read_work(struct wiphy *wiphy, struct wiphy_work *work) { struct debugfs_read_work *w = container_of(work, typeof(*w), work); w->ret = w->handler(w->wiphy, w->file, w->buf, w->bufsize, w->data); complete(&w->completion); } static void wiphy_locked_debugfs_read_cancel(struct dentry *dentry, void *data) { struct debugfs_read_work *w = data; wiphy_work_cancel(w->wiphy, &w->work); complete(&w->completion); } ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, char __user *userbuf, size_t count, loff_t *ppos, ssize_t (*handler)(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, void *data), void *data) { struct debugfs_read_work work = { .handler = handler, .wiphy = wiphy, .file = file, .buf = buf, .bufsize = bufsize, .data = data, .ret = -ENODEV, .completion = COMPLETION_INITIALIZER_ONSTACK(work.completion), }; struct debugfs_cancellation cancellation = { .cancel = wiphy_locked_debugfs_read_cancel, .cancel_data = &work, }; /* don't leak stack data or whatever */ memset(buf, 0, bufsize); wiphy_work_init(&work.work, wiphy_locked_debugfs_read_work); wiphy_work_queue(wiphy, &work.work); debugfs_enter_cancellation(file, &cancellation); wait_for_completion(&work.completion); debugfs_leave_cancellation(file, &cancellation); if (work.ret < 0) return work.ret; if (WARN_ON(work.ret > bufsize)) return -EINVAL; return simple_read_from_buffer(userbuf, count, ppos, buf, work.ret); } EXPORT_SYMBOL_GPL(wiphy_locked_debugfs_read); struct debugfs_write_work { struct wiphy_work work; ssize_t (*handler)(struct wiphy *wiphy, struct file *file, char *buf, size_t count, void *data); struct wiphy *wiphy; struct file *file; char *buf; size_t count; void *data; ssize_t ret; struct completion completion; }; static void wiphy_locked_debugfs_write_work(struct wiphy *wiphy, struct wiphy_work *work) { struct debugfs_write_work *w = container_of(work, typeof(*w), work); w->ret = w->handler(w->wiphy, w->file, w->buf, w->count, w->data); complete(&w->completion); } static void wiphy_locked_debugfs_write_cancel(struct dentry *dentry, void *data) { struct debugfs_write_work *w = data; wiphy_work_cancel(w->wiphy, &w->work); complete(&w->completion); } ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, char *buf, size_t bufsize, const char __user *userbuf, size_t count, ssize_t (*handler)(struct wiphy *wiphy, struct file *file, char *buf, size_t count, void *data), void *data) { struct debugfs_write_work work = { .handler = handler, .wiphy = wiphy, .file = file, .buf = buf, .count = count, .data = data, .ret = -ENODEV, .completion = COMPLETION_INITIALIZER_ONSTACK(work.completion), }; struct debugfs_cancellation cancellation = { .cancel = wiphy_locked_debugfs_write_cancel, .cancel_data = &work, }; /* mostly used for strings so enforce NUL-termination for safety */ if (count >= bufsize) return -EINVAL; memset(buf, 0, bufsize); if (copy_from_user(buf, userbuf, count)) return -EFAULT; wiphy_work_init(&work.work, wiphy_locked_debugfs_write_work); wiphy_work_queue(wiphy, &work.work); debugfs_enter_cancellation(file, &cancellation); wait_for_completion(&work.completion); debugfs_leave_cancellation(file, &cancellation); return work.ret; } EXPORT_SYMBOL_GPL(wiphy_locked_debugfs_write);
1 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 // SPDX-License-Identifier: GPL-2.0-or-later /* net/sched/sch_teql.c "True" (or "trivial") link equalizer. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/moduleparam.h> #include <net/dst.h> #include <net/neighbour.h> #include <net/pkt_sched.h> /* How to setup it. ---------------- After loading this module you will find a new device teqlN and new qdisc with the same name. To join a slave to the equalizer you should just set this qdisc on a device f.e. # tc qdisc add dev eth0 root teql0 # tc qdisc add dev eth1 root teql0 That's all. Full PnP 8) Applicability. -------------- 1. Slave devices MUST be active devices, i.e., they must raise the tbusy signal and generate EOI events. If you want to equalize virtual devices like tunnels, use a normal eql device. 2. This device puts no limitations on physical slave characteristics f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-) Certainly, large difference in link speeds will make the resulting eqalized link unusable, because of huge packet reordering. I estimate an upper useful difference as ~10 times. 3. If the slave requires address resolution, only protocols using neighbour cache (IPv4/IPv6) will work over the equalized link. Other protocols are still allowed to use the slave device directly, which will not break load balancing, though native slave traffic will have the highest priority. */ struct teql_master { struct Qdisc_ops qops; struct net_device *dev; struct Qdisc *slaves; struct list_head master_list; unsigned long tx_bytes; unsigned long tx_packets; unsigned long tx_errors; unsigned long tx_dropped; }; struct teql_sched_data { struct Qdisc *next; struct teql_master *m; struct sk_buff_head q; }; #define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next) #define FMASK (IFF_BROADCAST | IFF_POINTOPOINT) /* "teql*" qdisc routines */ static int teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) { __skb_queue_tail(&q->q, skb); return NET_XMIT_SUCCESS; } return qdisc_drop(skb, sch, to_free); } static struct sk_buff * teql_dequeue(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; struct sk_buff *skb; struct Qdisc *q; skb = __skb_dequeue(&dat->q); dat_queue = netdev_get_tx_queue(dat->m->dev, 0); q = rcu_dereference_bh(dat_queue->qdisc); if (skb == NULL) { struct net_device *m = qdisc_dev(q); if (m) { dat->m->slaves = sch; netif_wake_queue(m); } } else { qdisc_bstats_update(sch, skb); } sch->q.qlen = dat->q.qlen + q->q.qlen; return skb; } static struct sk_buff * teql_peek(struct Qdisc *sch) { /* teql is meant to be used as root qdisc */ return NULL; } static void teql_reset(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); skb_queue_purge(&dat->q); } static void teql_destroy(struct Qdisc *sch) { struct Qdisc *q, *prev; struct teql_sched_data *dat = qdisc_priv(sch); struct teql_master *master = dat->m; if (!master) return; prev = master->slaves; if (prev) { do { q = NEXT_SLAVE(prev); if (q == sch) { NEXT_SLAVE(prev) = NEXT_SLAVE(q); if (q == master->slaves) { master->slaves = NEXT_SLAVE(q); if (q == master->slaves) { struct netdev_queue *txq; spinlock_t *root_lock; txq = netdev_get_tx_queue(master->dev, 0); master->slaves = NULL; root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc)); spin_lock_bh(root_lock); qdisc_reset(rtnl_dereference(txq->qdisc)); spin_unlock_bh(root_lock); } } skb_queue_purge(&dat->q); break; } } while ((prev = q) != master->slaves); } } static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct teql_master *m = (struct teql_master *)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); if (dev->hard_header_len > m->dev->hard_header_len) return -EINVAL; if (m->dev == dev) return -ELOOP; q->m = m; skb_queue_head_init(&q->q); if (m->slaves) { if (m->dev->flags & IFF_UP) { if ((m->dev->flags & IFF_POINTOPOINT && !(dev->flags & IFF_POINTOPOINT)) || (m->dev->flags & IFF_BROADCAST && !(dev->flags & IFF_BROADCAST)) || (m->dev->flags & IFF_MULTICAST && !(dev->flags & IFF_MULTICAST)) || dev->mtu < m->dev->mtu) return -EINVAL; } else { if (!(dev->flags&IFF_POINTOPOINT)) m->dev->flags &= ~IFF_POINTOPOINT; if (!(dev->flags&IFF_BROADCAST)) m->dev->flags &= ~IFF_BROADCAST; if (!(dev->flags&IFF_MULTICAST)) m->dev->flags &= ~IFF_MULTICAST; if (dev->mtu < m->dev->mtu) m->dev->mtu = dev->mtu; } q->next = NEXT_SLAVE(m->slaves); NEXT_SLAVE(m->slaves) = sch; } else { q->next = sch; m->slaves = sch; m->dev->mtu = dev->mtu; m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK); } return 0; } static int __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev, struct netdev_queue *txq, struct dst_entry *dst) { struct neighbour *n; int err = 0; n = dst_neigh_lookup_skb(dst, skb); if (!n) return -ENOENT; if (dst->dev != dev) { struct neighbour *mn; mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev); neigh_release(n); if (IS_ERR(mn)) return PTR_ERR(mn); n = mn; } if (neigh_event_send(n, skb_res) == 0) { int err; char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, n, dev); err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)), haddr, NULL, skb->len); if (err < 0) err = -EINVAL; } else { err = (skb_res == NULL) ? -EAGAIN : 1; } neigh_release(n); return err; } static inline int teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev, struct netdev_queue *txq) { struct dst_entry *dst = skb_dst(skb); int res; if (rcu_access_pointer(txq->qdisc) == &noop_qdisc) return -ENODEV; if (!dev->header_ops || !dst) return 0; rcu_read_lock(); res = __teql_resolve(skb, skb_res, dev, txq, dst); rcu_read_unlock(); return res; } static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev) { struct teql_master *master = netdev_priv(dev); struct Qdisc *start, *q; int busy; int nores; int subq = skb_get_queue_mapping(skb); struct sk_buff *skb_res = NULL; start = master->slaves; restart: nores = 0; busy = 0; q = start; if (!q) goto drop; do { struct net_device *slave = qdisc_dev(q); struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0); if (rcu_access_pointer(slave_txq->qdisc_sleeping) != q) continue; if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) || !netif_running(slave)) { busy = 1; continue; } switch (teql_resolve(skb, skb_res, slave, slave_txq)) { case 0: if (__netif_tx_trylock(slave_txq)) { unsigned int length = qdisc_pkt_len(skb); if (!netif_xmit_frozen_or_stopped(slave_txq) && netdev_start_xmit(skb, slave, slave_txq, false) == NETDEV_TX_OK) { __netif_tx_unlock(slave_txq); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); master->tx_packets++; master->tx_bytes += length; return NETDEV_TX_OK; } __netif_tx_unlock(slave_txq); } if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0))) busy = 1; break; case 1: master->slaves = NEXT_SLAVE(q); return NETDEV_TX_OK; default: nores = 1; break; } __skb_pull(skb, skb_network_offset(skb)); } while ((q = NEXT_SLAVE(q)) != start); if (nores && skb_res == NULL) { skb_res = skb; goto restart; } if (busy) { netif_stop_queue(dev); return NETDEV_TX_BUSY; } master->tx_errors++; drop: master->tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } static int teql_master_open(struct net_device *dev) { struct Qdisc *q; struct teql_master *m = netdev_priv(dev); int mtu = 0xFFFE; unsigned int flags = IFF_NOARP | IFF_MULTICAST; if (m->slaves == NULL) return -EUNATCH; flags = FMASK; q = m->slaves; do { struct net_device *slave = qdisc_dev(q); if (slave == NULL) return -EUNATCH; if (slave->mtu < mtu) mtu = slave->mtu; if (slave->hard_header_len > LL_MAX_HEADER) return -EINVAL; /* If all the slaves are BROADCAST, master is BROADCAST If all the slaves are PtP, master is PtP Otherwise, master is NBMA. */ if (!(slave->flags&IFF_POINTOPOINT)) flags &= ~IFF_POINTOPOINT; if (!(slave->flags&IFF_BROADCAST)) flags &= ~IFF_BROADCAST; if (!(slave->flags&IFF_MULTICAST)) flags &= ~IFF_MULTICAST; } while ((q = NEXT_SLAVE(q)) != m->slaves); m->dev->mtu = mtu; m->dev->flags = (m->dev->flags&~FMASK) | flags; netif_start_queue(m->dev); return 0; } static int teql_master_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static void teql_master_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct teql_master *m = netdev_priv(dev); stats->tx_packets = m->tx_packets; stats->tx_bytes = m->tx_bytes; stats->tx_errors = m->tx_errors; stats->tx_dropped = m->tx_dropped; } static int teql_master_mtu(struct net_device *dev, int new_mtu) { struct teql_master *m = netdev_priv(dev); struct Qdisc *q; q = m->slaves; if (q) { do { if (new_mtu > qdisc_dev(q)->mtu) return -EINVAL; } while ((q = NEXT_SLAVE(q)) != m->slaves); } WRITE_ONCE(dev->mtu, new_mtu); return 0; } static const struct net_device_ops teql_netdev_ops = { .ndo_open = teql_master_open, .ndo_stop = teql_master_close, .ndo_start_xmit = teql_master_xmit, .ndo_get_stats64 = teql_master_stats64, .ndo_change_mtu = teql_master_mtu, }; static __init void teql_master_setup(struct net_device *dev) { struct teql_master *master = netdev_priv(dev); struct Qdisc_ops *ops = &master->qops; master->dev = dev; ops->priv_size = sizeof(struct teql_sched_data); ops->enqueue = teql_enqueue; ops->dequeue = teql_dequeue; ops->peek = teql_peek; ops->init = teql_qdisc_init; ops->reset = teql_reset; ops->destroy = teql_destroy; ops->owner = THIS_MODULE; dev->netdev_ops = &teql_netdev_ops; dev->type = ARPHRD_VOID; dev->mtu = 1500; dev->min_mtu = 68; dev->max_mtu = 65535; dev->tx_queue_len = 100; dev->flags = IFF_NOARP; dev->hard_header_len = LL_MAX_HEADER; netif_keep_dst(dev); } static LIST_HEAD(master_dev_list); static int max_equalizers = 1; module_param(max_equalizers, int, 0); MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers"); static int __init teql_init(void) { int i; int err = -ENODEV; for (i = 0; i < max_equalizers; i++) { struct net_device *dev; struct teql_master *master; dev = alloc_netdev(sizeof(struct teql_master), "teql%d", NET_NAME_UNKNOWN, teql_master_setup); if (!dev) { err = -ENOMEM; break; } if ((err = register_netdev(dev))) { free_netdev(dev); break; } master = netdev_priv(dev); strscpy(master->qops.id, dev->name, IFNAMSIZ); err = register_qdisc(&master->qops); if (err) { unregister_netdev(dev); free_netdev(dev); break; } list_add_tail(&master->master_list, &master_dev_list); } return i ? 0 : err; } static void __exit teql_exit(void) { struct teql_master *master, *nxt; list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) { list_del(&master->master_list); unregister_qdisc(&master->qops); unregister_netdev(master->dev); free_netdev(master->dev); } } module_init(teql_init); module_exit(teql_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("True (or trivial) link equalizer qdisc");
3779 3 1045 3 2754 5 3799 98 99 64 32 91 90 91 21 91 91 16 16 135 4 3 2 2 25 100 63 69 2570 2571 1193 1808 2680 4 3 3 2 2665 103 2588 803 2332 2577 107 3 73 33 86 20 104 107 2666 23 555 2114 1233 1861 2591 2662 1 5 2 30 32 37 37 1 3 3 18 22 26 26 646 646 643 647 392 232 53 31 55 64 215 5 2 7 10 164 31 175 153 564 1 2 8 15 538 486 56 135 441 526 140 4 62 73 102 50 78 74 132 518 13 377 136 439 94 155 410 508 2 7 2 42 41 85 1 7 2 19 33 57 140 519 90 89 62 58 160 11 2 140 15 2 3 2 148 144 2 1 135 8 58 3 74 139 163 2 16 143 2 173 173 176 3 77 98 174 174 176 2 175 10 2 2 1 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap_prepare = generic_file_readonly_mmap_prepare, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * @cookie: cookie to reset * * Update the file offset to the value specified by @offset if the given * offset is valid and it is not equal to the current file offset and * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; if (cookie) *cookie = 0; } return offset; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. * * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * If the file requires locking via f_pos_lock we know * that mutual exclusion for SEEK_CUR on the same file * is guaranteed. If the file isn't locked, we take * f_lock to protect against f_pos races with other * SEEK_CURs. */ if (file_seek_cur_needs_f_lock(file)) { guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. */ loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie) { struct inode *inode = file->f_mapping->host; loff_t maxsize = inode->i_sb->s_maxbytes; loff_t eof = i_size_read(inode); int ret; if (WARN_ON_ONCE(!cookie)) return -EINVAL; /* * Require that this is only used for directories that guarantee * synchronization between readdir and seek so that an update to * @cookie is correctly synchronized with concurrent readdir. */ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) return -EINVAL; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; /* No need to hold f_lock because we know that f_pos_lock is held. */ if (whence == SEEK_CUR) return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); return vfs_setpos_cookie(file, offset, maxsize, cookie); } EXPORT_SYMBOL(generic_llseek_cookie); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implementation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; retval = inode_lock_killable(inode); if (retval) return retval; switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; CLASS(fd_pos, f)(fd); if (fd_empty(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; CLASS(fd_pos, f)(fd); loff_t offset; if (fd_empty(f)) return -EBADF; if (whence > SEEK_MAX) return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PREAD) return vfs_read(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PWRITE) return vfs_write(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ CLASS(fd, in)(in_fd); if (fd_empty(in)) return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ CLASS(fd, out)(out_fd); if (fd_empty(out)) return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { if (pos >= max) return -EOVERFLOW; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; /* * Make sure return value doesn't overflow in 32bit compat mode. Also * limit the size for all cases except when calling ->copy_file_range(). */ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) len = min_t(size_t, MAX_RW_COUNT, len); file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; ssize_t ret = -EBADF; CLASS(fd, f_in)(fd_in); if (fd_empty(f_in)) return -EBADF; CLASS(fd, f_out)(fd_out); if (fd_empty(f_out)) return -EBADF; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } if (flags != 0) return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return -EINVAL; if (!is_power_of_2(len)) return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; if (!(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0-or-later /* Kernel module to match IPComp parameters for IPv4 and IPv6 * * Copyright (C) 2013 WindRiver * * Author: * Fan Du <fan.du@windriver.com> * * Based on: * net/netfilter/xt_esp.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/netfilter/xt_ipcomp.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fan Du <fan.du@windriver.com>"); MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match"); MODULE_ALIAS("ipt_ipcomp"); MODULE_ALIAS("ip6t_ipcomp"); /* Returns 1 if the spi is matched by the range, 0 otherwise */ static inline bool spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct ip_comp_hdr _comphdr; const struct ip_comp_hdr *chdr; const struct xt_ipcomp *compinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr); if (chdr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ pr_debug("Dropping evil IPComp tinygram.\n"); par->hotdrop = true; return false; } return spi_match(compinfo->spis[0], compinfo->spis[1], ntohs(chdr->cpi), !!(compinfo->invflags & XT_IPCOMP_INV_SPI)); } static int comp_mt_check(const struct xt_mtchk_param *par) { const struct xt_ipcomp *compinfo = par->matchinfo; /* Must specify no unknown invflags */ if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) { pr_info_ratelimited("unknown flags %X\n", compinfo->invflags); return -EINVAL; } return 0; } static struct xt_match comp_mt_reg[] __read_mostly = { { .name = "ipcomp", .family = NFPROTO_IPV4, .match = comp_mt, .matchsize = sizeof(struct xt_ipcomp), .proto = IPPROTO_COMP, .checkentry = comp_mt_check, .me = THIS_MODULE, }, { .name = "ipcomp", .family = NFPROTO_IPV6, .match = comp_mt, .matchsize = sizeof(struct xt_ipcomp), .proto = IPPROTO_COMP, .checkentry = comp_mt_check, .me = THIS_MODULE, }, }; static int __init comp_mt_init(void) { return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); } static void __exit comp_mt_exit(void) { xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg)); } module_init(comp_mt_init); module_exit(comp_mt_exit);
1 6 7 7 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 // SPDX-License-Identifier: GPL-2.0-or-later /* * Syncookies implementation for the Linux kernel * * Copyright (C) 1997 Andi Kleen * Based on ideas by D.J.Bernstein and Eric Schenk. */ #include <linux/tcp.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/route.h> static siphash_aligned_key_t syncookie_secret[2]; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) /* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK * stores TCP options: * * MSB LSB * | 31 ... 6 | 5 | 4 | 3 2 1 0 | * | Timestamp | ECN | SACK | WScale | * * When we receive a valid cookie-ACK, we look at the echoed tsval (if * any) to figure out which TCP options we should use for the rebuilt * connection. * * A WScale setting of '0xf' (which is an invalid scaling value) * means that original syn did not include the TCP window scaling option. */ #define TS_OPT_WSCALE_MASK 0xf #define TS_OPT_SACK BIT(4) #define TS_OPT_ECN BIT(5) /* There is no TS_OPT_TIMESTAMP: * if ACK contains timestamp option, we already know it was * requested/supported by the syn/synack exchange. */ #define TSBITS 6 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); return siphash_4u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, count, &syncookie_secret[c]); } /* * when syncookies are in effect and tcp timestamps are enabled we encode * tcp options in the lower bits of the timestamp value that will be * sent in the syn-ack. * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ u64 cookie_init_timestamp(struct request_sock *req, u64 now) { const struct inet_request_sock *ireq = inet_rsk(req); u64 ts, ts_now = tcp_ns_to_ts(false, now); u32 options = 0; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; if (ireq->sack_ok) options |= TS_OPT_SACK; if (ireq->ecn_ok) options |= TS_OPT_ECN; ts = (ts_now >> TSBITS) << TSBITS; ts |= options; if (ts > ts_now) ts -= (1UL << TSBITS); if (tcp_rsk(req)->req_usec_ts) return ts * NSEC_PER_USEC; return ts * NSEC_PER_MSEC; } static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { /* * Compute the secure sequence number. * The output should be: * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). * Where sseq is their sequence number and count increases every * minute by 1. * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } /* * This retrieves the small "data" value from the syncookie. * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * * The count value used to generate the cookie must be less than * MAX_SYNCOOKIE_AGE minutes in the past. * The return value (__u32)-1 if this test fails. */ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq) { u32 diff, count = tcp_cookie_time(); /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; /* Leaving the data behind */ } /* * MSS Values are chosen based on the 2011 paper * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. * Values .. * .. lower than 536 are rare (< 0.2%) * .. between 537 and 1299 account for less than < 1.5% of observed values * .. in the 1300-1349 range account for about 15 to 20% of observed mss values * .. exceeding 1460 are very rare (< 0.04%) * * 1460 is the single most frequently announced mss value (30 to 46% depending * on monitor location). Table must be sorted. */ static __u16 const msstab[] = { 536, 1300, 1440, /* 1440, 1452: PPPoE */ 1460, }; /* * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v4_init_sequence(iph, th, mssp); } /* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; bool own_req; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, NULL, &own_req); if (child) { refcount_set(&req->rsk_refcnt, 1); sock_rps_save_rxhash(child, skb); if (rsk_drop_req(req)) { reqsk_put(req); return child; } if (inet_csk_reqsk_queue_add(sk, req, child)) return child; bh_unlock_sock(child); sock_put(child); } __reqsk_free(req); return NULL; } EXPORT_IPV6_MOD(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * * return false if we decode a tcp option that is disabled * on the host. */ bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); return true; } if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } EXPORT_IPV6_MOD(cookie_timestamp_decode); static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); const struct tcphdr *th = tcp_hdr(skb); req->num_retrans = 0; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); ireq->ir_mark = inet_request_mark(sk, skb); if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; treq->snt_synack = 0; treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = ntohl(th->ack_seq) - 1; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; treq->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) return mptcp_subflow_init_cookie_req(req, sk, skb); #endif return 0; } #if IS_ENABLED(CONFIG_BPF) struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) { struct request_sock *req = inet_reqsk(skb->sk); skb->sk = NULL; skb->destructor = NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); req = NULL; } return req; } EXPORT_IPV6_MOD_GPL(cookie_bpf_check); #endif struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff) { struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; if (sk_is_mptcp(sk)) req = mptcp_subflow_reqsk_alloc(ops, sk, false); else req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); return NULL; } ireq = inet_rsk(req); treq = tcp_rsk(req); req->mss = mss; req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; ireq->snd_wscale = tcp_opt->snd_wscale; ireq->tstamp_ok = tcp_opt->saw_tstamp; ireq->sack_ok = tcp_opt->sack_ok; ireq->wscale_ok = tcp_opt->wscale_ok; ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); treq->ts_off = tsoff; return req; } EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; struct rtable *rt; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } tcp_ao_syncookie(sk, skb, req, AF_INET); /* * We need to lookup the route here to get at the correct * window size. We should better make sure that the window size * hasn't changed since we received the original syn, but I see * no easy way to do this. */ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; }
5 166 40 171 328 331 252 331 19 419 39 39 253 254 416 419 414 330 329 181 330 298 59 4 2 43 191 267 270 175 96 18 41 48 300 307 143 138 132 306 297 4 6 305 17 9 282 287 289 289 289 306 308 308 307 10 289 17 36 305 11 285 63 289 289 266 276 20 261 27 282 61 153 247 219 30 307 1 310 308 311 308 311 310 308 10 104 54 144 142 163 2 2 17 21 21 259 1 39 2 294 281 19 300 300 298 196 193 189 22 95 35 213 197 45 193 115 13 56 100 56 48 213 3 11 203 200 35 214 1 33 176 173 178 3 3 2 1 2 3 78 5 73 72 2 2 146 94 3 50 2 135 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; u64 slack = current->timer_slack_ns; if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < slack) return slack; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); WRITE_ONCE(pwq->triggered, 1); /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!READ_ONCE(pwq->triggered)) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return -EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; return vfs_poll(fd_file(f), wait); } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { if (i >= n) break; if (!(bit & all_bits)) continue; mask = select_poll_one(i, wait, in, out, bit, busy_flag); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (unlikely(n < 0)) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (can_do_masked_user_access()) from = masked_user_access_begin(from); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask, filter; if (unlikely(fd < 0)) return 0; CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; return mask & filter; /* Mask out unneeded events. */ } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { __poll_t mask; /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag); pfd->revents = mangle_poll(mask); if (mask) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
3 6 6 3 6 9 2 2 5 1 4 2 4 1 1 1 1 1 3 1 10 1 1 2 1 5 1 1 1 3 2 1 3 1 1 1 19 19 4 4 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 // SPDX-License-Identifier: GPL-2.0 /* XSKMAP used for AF_XDP sockets * Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf.h> #include <linux/filter.h> #include <net/xdp_sock.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/btf_ids.h> #include "xsk.h" static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock __rcu **map_entry) { struct xsk_map_node *node; node = bpf_map_kzalloc(&map->map, sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); bpf_map_inc(&map->map); atomic_inc(&map->count); node->map = map; node->map_entry = map_entry; return node; } static void xsk_map_node_free(struct xsk_map_node *node) { struct xsk_map *map = node->map; bpf_map_put(&node->map->map); kfree(node); atomic_dec(&map->count); } static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) { spin_lock_bh(&xs->map_list_lock); list_add_tail(&node->node, &xs->map_list); spin_unlock_bh(&xs->map_list_lock); } static void xsk_map_sock_delete(struct xdp_sock *xs, struct xdp_sock __rcu **map_entry) { struct xsk_map_node *n, *tmp; spin_lock_bh(&xs->map_list_lock); list_for_each_entry_safe(n, tmp, &xs->map_list, node) { if (map_entry == n->map_entry) { list_del(&n->node); xsk_map_node_free(n); } } spin_unlock_bh(&xs->map_list_lock); } static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct xsk_map *m; int numa_node; u64 size; if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) return ERR_PTR(-EINVAL); numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); m = bpf_map_area_alloc(size, numa_node); if (!m) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&m->map, attr); spin_lock_init(&m->lock); return &m->map; } static u64 xsk_map_mem_usage(const struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); return struct_size(m, xsk_map, map->max_entries) + (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node); } static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); synchronize_net(); bpf_map_area_free(m); } static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct xsk_map *m = container_of(map, struct xsk_map, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= m->map.max_entries) { *next = 0; return 0; } if (index == m->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) { struct xsk_map *m = container_of(map, struct xsk_map, map); if (key >= map->max_entries) return NULL; return rcu_dereference_check(m->xsk_map[key], rcu_read_lock_bh_held()); } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { return __xsk_map_lookup_elem(map, *(u32 *)key); } static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; struct xdp_sock *xs, *old_xs; u32 i = *(u32 *)key, fd = *(u32 *)value; struct xsk_map_node *node; struct socket *sock; int err; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= m->map.max_entries)) return -E2BIG; sock = sockfd_lookup(fd, &err); if (!sock) return err; if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return -EOPNOTSUPP; } xs = (struct xdp_sock *)sock->sk; map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); if (IS_ERR(node)) { sockfd_put(sock); return PTR_ERR(node); } spin_lock_bh(&m->lock); old_xs = rcu_dereference_protected(*map_entry, lockdep_is_held(&m->lock)); if (old_xs == xs) { err = 0; goto out; } else if (old_xs && map_flags == BPF_NOEXIST) { err = -EEXIST; goto out; } else if (!old_xs && map_flags == BPF_EXIST) { err = -ENOENT; goto out; } xsk_map_sock_add(xs, node); rcu_assign_pointer(*map_entry, xs); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); sockfd_put(sock); return 0; out: spin_unlock_bh(&m->lock); sockfd_put(sock); xsk_map_node_free(node); return err; } static long xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; struct xdp_sock *old_xs; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; spin_lock_bh(&m->lock); map_entry = &m->xsk_map[k]; old_xs = unrcu_pointer(xchg(map_entry, NULL)); if (old_xs) xsk_map_sock_delete(old_xs, map_entry); spin_unlock_bh(&m->lock); return 0; } static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); } void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock __rcu **map_entry) { spin_lock_bh(&map->lock); if (rcu_access_pointer(*map_entry) == xs) { rcu_assign_pointer(*map_entry, NULL); xsk_map_sock_delete(xs, map_entry); } spin_unlock_bh(&map->lock); } static bool xsk_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { return meta0->max_entries == meta1->max_entries && bpf_map_meta_equal(meta0, meta1); } BTF_ID_LIST_SINGLE(xsk_map_btf_ids, struct, xsk_map) const struct bpf_map_ops xsk_map_ops = { .map_meta_equal = xsk_map_meta_equal, .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, .map_gen_lookup = xsk_map_gen_lookup, .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = xsk_map_mem_usage, .map_btf_id = &xsk_map_btf_ids[0], .map_redirect = xsk_map_redirect, };
1 7 7 7 13 2 8 7 13 7 8 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 // SPDX-License-Identifier: GPL-2.0-or-later /* * em_canid.c Ematch rule to match CAN frames according to their CAN IDs * * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de> * Copyright: (c) 2011 Czech Technical University in Prague * (c) 2011 Volkswagen Group Research * Authors: Michal Sojka <sojkam1@fel.cvut.cz> * Pavel Pisa <pisa@cmp.felk.cvut.cz> * Rostislav Lisovy <lisovy@gmail.cz> * Funded by: Volkswagen Group Research */ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> #include <linux/can.h> #define EM_CAN_RULES_MAX 500 struct canid_match { /* For each SFF CAN ID (11 bit) there is one record in this bitfield */ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); int rules_count; int sff_rules_count; int eff_rules_count; /* * Raw rules copied from netlink message; Used for sending * information to userspace (when 'tc filter show' is invoked) * AND when matching EFF frames */ struct can_filter rules_raw[]; }; /** * em_canid_get_id() - Extracts Can ID out of the sk_buff structure. * @skb: buffer to extract Can ID from */ static canid_t em_canid_get_id(struct sk_buff *skb) { /* CAN ID is stored within the data field */ struct can_frame *cf = (struct can_frame *)skb->data; return cf->can_id; } static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id, u32 can_mask) { int i; /* * Limit can_mask and can_id to SFF range to * protect against write after end of array */ can_mask &= CAN_SFF_MASK; can_id &= can_mask; /* Single frame */ if (can_mask == CAN_SFF_MASK) { set_bit(can_id, cm->match_sff); return; } /* All frames */ if (can_mask == 0) { bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS)); return; } /* * Individual frame filter. * Add record (set bit to 1) for each ID that * conforms particular rule */ for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) { if ((i & can_mask) == can_id) set_bit(i, cm->match_sff); } } static inline struct canid_match *em_canid_priv(struct tcf_ematch *m) { return (struct canid_match *)m->data; } static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { struct canid_match *cm = em_canid_priv(m); canid_t can_id; int match = 0; int i; const struct can_filter *lp; can_id = em_canid_get_id(skb); if (can_id & CAN_EFF_FLAG) { for (i = 0, lp = cm->rules_raw; i < cm->eff_rules_count; i++, lp++) { if (!(((lp->can_id ^ can_id) & lp->can_mask))) { match = 1; break; } } } else { /* SFF */ can_id &= CAN_SFF_MASK; match = (test_bit(can_id, cm->match_sff) ? 1 : 0); } return match; } static int em_canid_change(struct net *net, void *data, int len, struct tcf_ematch *m) { struct can_filter *conf = data; /* Array with rules */ struct canid_match *cm; int i; if (!len) return -EINVAL; if (len % sizeof(struct can_filter)) return -EINVAL; if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX) return -EINVAL; cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL); if (!cm) return -ENOMEM; cm->rules_count = len / sizeof(struct can_filter); /* * We need two for() loops for copying rules into two contiguous * areas in rules_raw to process all eff rules with a simple loop. * NB: The configuration interface supports sff and eff rules. * We do not support filters here that match for the same can_id * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123). * For this (unusual case) two filters have to be specified. The * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id. */ /* Fill rules_raw with EFF rules first */ for (i = 0; i < cm->rules_count; i++) { if (conf[i].can_id & CAN_EFF_FLAG) { memcpy(cm->rules_raw + cm->eff_rules_count, &conf[i], sizeof(struct can_filter)); cm->eff_rules_count++; } } /* append SFF frame rules */ for (i = 0; i < cm->rules_count; i++) { if (!(conf[i].can_id & CAN_EFF_FLAG)) { memcpy(cm->rules_raw + cm->eff_rules_count + cm->sff_rules_count, &conf[i], sizeof(struct can_filter)); cm->sff_rules_count++; em_canid_sff_match_add(cm, conf[i].can_id, conf[i].can_mask); } } m->datalen = sizeof(struct canid_match) + len; m->data = (unsigned long)cm; return 0; } static void em_canid_destroy(struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); kfree(cm); } static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m) { struct canid_match *cm = em_canid_priv(m); /* * When configuring this ematch 'rules_count' is set not to exceed * 'rules_raw' array size */ if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count, &cm->rules_raw) < 0) return -EMSGSIZE; return 0; } static struct tcf_ematch_ops em_canid_ops = { .kind = TCF_EM_CANID, .change = em_canid_change, .match = em_canid_match, .destroy = em_canid_destroy, .dump = em_canid_dump, .owner = THIS_MODULE, .link = LIST_HEAD_INIT(em_canid_ops.link) }; static int __init init_em_canid(void) { return tcf_em_register(&em_canid_ops); } static void __exit exit_em_canid(void) { tcf_em_unregister(&em_canid_ops); } MODULE_DESCRIPTION("ematch classifier to match CAN IDs embedded in skb CAN frames"); MODULE_LICENSE("GPL"); module_init(init_em_canid); module_exit(exit_em_canid); MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
16 16 16 16 16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 // SPDX-License-Identifier: GPL-2.0-or-later /* ASN.1 Object identifier (OID) registry * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/export.h> #include <linux/oid_registry.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/bug.h> #include <linux/asn1.h> #include "oid_registry_data.c" MODULE_DESCRIPTION("OID Registry"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /** * look_up_OID - Find an OID registration for the specified data * @data: Binary representation of the OID * @datasize: Size of the binary representation */ enum OID look_up_OID(const void *data, size_t datasize) { const unsigned char *octets = data; enum OID oid; unsigned char xhash; unsigned i, j, k, hash; size_t len; /* Hash the OID data */ hash = datasize - 1; for (i = 0; i < datasize; i++) hash += octets[i] * 33; hash = (hash >> 24) ^ (hash >> 16) ^ (hash >> 8) ^ hash; hash &= 0xff; /* Binary search the OID registry. OIDs are stored in ascending order * of hash value then ascending order of size and then in ascending * order of reverse value. */ i = 0; k = OID__NR; while (i < k) { j = (i + k) / 2; xhash = oid_search_table[j].hash; if (xhash > hash) { k = j; continue; } if (xhash < hash) { i = j + 1; continue; } oid = oid_search_table[j].oid; len = oid_index[oid + 1] - oid_index[oid]; if (len > datasize) { k = j; continue; } if (len < datasize) { i = j + 1; continue; } /* Variation is most likely to be at the tail end of the * OID, so do the comparison in reverse. */ while (len > 0) { unsigned char a = oid_data[oid_index[oid] + --len]; unsigned char b = octets[len]; if (a > b) { k = j; goto next; } if (a < b) { i = j + 1; goto next; } } return oid; next: ; } return OID__NR; } EXPORT_SYMBOL_GPL(look_up_OID); /** * parse_OID - Parse an OID from a bytestream * @data: Binary representation of the header + OID * @datasize: Size of the binary representation * @oid: Pointer to oid to return result * * Parse an OID from a bytestream that holds the OID in the format * ASN1_OID | length | oid. The length indicator must equal to datasize - 2. * -EBADMSG is returned if the bytestream is too short. */ int parse_OID(const void *data, size_t datasize, enum OID *oid) { const unsigned char *v = data; /* we need 2 bytes of header and at least 1 byte for oid */ if (datasize < 3 || v[0] != ASN1_OID || v[1] != datasize - 2) return -EBADMSG; *oid = look_up_OID(data + 2, datasize - 2); return 0; } EXPORT_SYMBOL_GPL(parse_OID); /* * sprint_oid - Print an Object Identifier into a buffer * @data: The encoded OID to print * @datasize: The size of the encoded OID * @buffer: The buffer to render into * @bufsize: The size of the buffer * * The OID is rendered into the buffer in "a.b.c.d" format and the number of * bytes is returned. -EBADMSG is returned if the data could not be interpreted * and -ENOBUFS if the buffer was too small. */ int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize) { const unsigned char *v = data, *end = v + datasize; unsigned long num; unsigned char n; size_t ret; int count; if (v >= end) goto bad; n = *v++; ret = count = snprintf(buffer, bufsize, "%u.%u", n / 40, n % 40); if (count >= bufsize) return -ENOBUFS; buffer += count; bufsize -= count; while (v < end) { n = *v++; if (!(n & 0x80)) { num = n; } else { num = n & 0x7f; do { if (v >= end) goto bad; n = *v++; num <<= 7; num |= n & 0x7f; } while (n & 0x80); } ret += count = snprintf(buffer, bufsize, ".%lu", num); if (count >= bufsize) return -ENOBUFS; buffer += count; bufsize -= count; } return ret; bad: snprintf(buffer, bufsize, "(bad)"); return -EBADMSG; } EXPORT_SYMBOL_GPL(sprint_oid);
275 3 309 3 270 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
259 3 250 4 255 253 256 254 101 88 33 102 38 63 33 24 61 85 102 30 71 1 102 286 42 119 43 216 83 84 4 84 7 21 24 59 80 4 27 14 8 61 2 6 2 60 33 33 7 13 53 54 24 21 4 62 3 12 103 104 102 2 41 1 56 21 31 17 14 24 55 88 33 48 7 49 6 6 1 54 1 1 55 18 21 1 21 3 2 2 96 2 35 70 27 7 34 2 6 2 1 15 70 2 65 21 2 60 157 5 150 155 92 93 185 183 1 183 138 9 7 109 75 90 63 64 19 95 1 75 19 196 1 85 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 // SPDX-License-Identifier: GPL-2.0-or-later /* * inet_diag.c Module for monitoring INET transport protocols sockets. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/time.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_connection_sock.h> #include <net/bpf_sk_storage.h> #include <net/netlink.h> #include <linux/inet.h> #include <linux/stddef.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> static const struct inet_diag_handler __rcu **inet_diag_table; struct inet_diag_entry { const __be32 *saddr; const __be32 *daddr; u16 sport; u16 dport; u16 family; u16 userlocks; u32 ifindex; u32 mark; #ifdef CONFIG_SOCK_CGROUP_DATA u64 cgroup_id; #endif }; static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { const struct inet_diag_handler *handler; if (proto < 0 || proto >= IPPROTO_MAX) return NULL; if (!READ_ONCE(inet_diag_table[proto])) sock_load_diag_module(AF_INET, proto); rcu_read_lock(); handler = rcu_dereference(inet_diag_table[proto]); if (handler && !try_module_get(handler->owner)) handler = NULL; rcu_read_unlock(); return handler; } static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { module_put(handler->owner); } void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) { r->idiag_family = READ_ONCE(sk->sk_family); r->id.idiag_sport = htons(READ_ONCE(sk->sk_num)); r->id.idiag_dport = READ_ONCE(sk->sk_dport); r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if); sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr); data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr); } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr); r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr); } } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, * hence this needs to be included regardless of socket family. */ if (ext & (1 << (INET_DIAG_TOS - 1))) if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0) goto errout; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { if (ext & (1 << (INET_DIAG_TCLASS - 1))) if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) goto errout; } #endif if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark))) goto errout; if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; #ifdef CONFIG_CGROUP_NET_CLASSID classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif /* Fallback to socket priority if class id isn't set. * Classful qdiscs use it as direct reference to class. * For cgroup2 classid is always zero. */ if (!classid) classid = READ_ONCE(sk->sk_priority); if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) goto errout; } #ifdef CONFIG_SOCK_CGROUP_DATA if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), INET_DIAG_PAD)) goto errout; #endif r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); inet_sockopt.recverr = inet_test_bit(RECVERR, sk); inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk); inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk); if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), &inet_sockopt)) goto errout; return 0; errout: return 1; } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, struct nlattr **req_nlas) { struct nlattr *nla; int remaining; nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { int type = nla_type(nla); if (type == INET_DIAG_REQ_PROTOCOL && nla_len(nla) != sizeof(u32)) return -EINVAL; if (type < __INET_DIAG_REQ_MAX) req_nlas[type] = nla; } return 0; } static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, const struct inet_diag_dump_data *data) { if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); return req->sdiag_protocol; } #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; u8 icsk_pending; int protocol; cb_data = cb->data; protocol = inet_diag_get_protocol(req, cb_data); /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */ handler = rcu_dereference_protected(inet_diag_table[protocol], 1); DEBUG_NET_WARN_ON_ONCE(!handler); if (!handler) return -ENXIO; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; r->idiag_timer = 0; r->idiag_retrans = 0; r->idiag_expires = 0; if (inet_diag_msg_attrs_fill(sk, skb, r, ext, sk_user_ns(NETLINK_CB(cb->skb).sk), net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), .idiag_tmem = sk_wmem_alloc_get(sk), }; if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) goto errout; } if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) goto errout; /* * RAW sockets might have user-defined protocols assigned, * so report the one supplied on socket creation. */ if (sk->sk_type == SOCK_RAW) { if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol)) goto errout; } if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; } icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); r->idiag_expires = jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve_64bit(skb, INET_DIAG_INFO, handler->idiag_info_size, INET_DIAG_PAD); if (!attr) goto errout; info = nla_data(attr); } if (ext & (1 << (INET_DIAG_CONG - 1))) { int err = 0; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops) err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name); rcu_read_unlock(); if (err < 0) goto errout; } handler->idiag_get_info(sk, r, info); if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux) if (handler->idiag_get_aux(sk, net_admin, skb) < 0) goto errout; if (sk->sk_state < TCP_TIME_WAIT) { union tcp_cc_info info; size_t sz = 0; int attr; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ext, &attr, &info); rcu_read_unlock(); if (sz && nla_put(skb, attr, sz, &info) < 0) goto errout; } /* Keep it at the end for potential retry with a larger skb, * or else do best-effort fitting, which is only done for the * first_nlmsg. */ if (cb_data->bpf_stg_diag) { bool first_nlmsg = ((unsigned char *)nlh == skb->data); unsigned int prev_min_dump_alloc; unsigned int total_nla_size = 0; unsigned int msg_len; int err; msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, INET_DIAG_SK_BPF_STORAGES, &total_nla_size); if (!err) goto out; total_nla_size += msg_len; prev_min_dump_alloc = cb->min_dump_alloc; if (total_nla_size > prev_min_dump_alloc) cb->min_dump_alloc = min_t(u32, total_nla_size, MAX_DUMP_ALLOC_SIZE); if (!first_nlmsg) goto errout; if (cb->min_dump_alloc > prev_min_dump_alloc) /* Retry with pskb_expand_head() with * __GFP_DIRECT_RECLAIM */ goto errout; WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); /* Send what we have for this sk * and move on to the next sk in the following * dump() */ } out: nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; struct inet_diag_dump_data dump_data; int err, protocol; memset(&dump_data, 0, sizeof(dump_data)); err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); if (err) return err; protocol = inet_diag_get_protocol(req, &dump_data); handler = inet_diag_lock_handler(protocol); if (!handler) return -ENOENT; if (cmd == SOCK_DIAG_BY_FAMILY) { struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { err = handler->destroy(in_skb, req); } else { err = -EOPNOTSUPP; } inet_diag_unlock_handler(handler); return err; } static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) { int words = bits >> 5; bits &= 0x1f; if (words) { if (memcmp(a1, a2, words << 2)) return 0; } if (bits) { __be32 w1, w2; __be32 mask; w1 = a1[words]; w2 = a2[words]; mask = htonl((0xffffffff) << (32 - bits)); if ((w1 ^ w2) & mask) return 0; } return 1; } static int inet_diag_bc_run(const struct nlattr *_bc, const struct inet_diag_entry *entry) { const void *bc = nla_data(_bc); int len = nla_len(_bc); while (len > 0) { int yes = 1; const struct inet_diag_bc_op *op = bc; switch (op->code) { case INET_DIAG_BC_NOP: break; case INET_DIAG_BC_JMP: yes = 0; break; case INET_DIAG_BC_S_EQ: yes = entry->sport == op[1].no; break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; case INET_DIAG_BC_D_EQ: yes = entry->dport == op[1].no; break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; case INET_DIAG_BC_D_LE: yes = entry->dport <= op[1].no; break; case INET_DIAG_BC_AUTO: yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); break; case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: { const struct inet_diag_hostcond *cond; const __be32 *addr; cond = (const struct inet_diag_hostcond *)(op + 1); if (cond->port != -1 && cond->port != (op->code == INET_DIAG_BC_S_COND ? entry->sport : entry->dport)) { yes = 0; break; } if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; if (cond->family != AF_UNSPEC && cond->family != entry->family) { if (entry->family == AF_INET6 && cond->family == AF_INET) { if (addr[0] == 0 && addr[1] == 0 && addr[2] == htonl(0xffff) && bitstring_match(addr + 3, cond->addr, cond->prefix_len)) break; } yes = 0; break; } if (cond->prefix_len == 0) break; if (bitstring_match(addr, cond->addr, cond->prefix_len)) break; yes = 0; break; } case INET_DIAG_BC_DEV_COND: { u32 ifindex; ifindex = *((const u32 *)(op + 1)); if (ifindex != entry->ifindex) yes = 0; break; } case INET_DIAG_BC_MARK_COND: { struct inet_diag_markcond *cond; cond = (struct inet_diag_markcond *)(op + 1); if ((entry->mark & cond->mask) != cond->mark) yes = 0; break; } #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: { u64 cgroup_id; cgroup_id = get_unaligned((const u64 *)(op + 1)); if (cgroup_id != entry->cgroup_id) yes = 0; break; } #endif } if (yes) { len -= op->yes; bc += op->yes; } else { len -= op->no; bc += op->no; } } return len == 0; } /* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV) */ static void entry_fill_addrs(struct inet_diag_entry *entry, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (entry->family == AF_INET6) { entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; entry->daddr = sk->sk_v6_daddr.s6_addr32; } else #endif { entry->saddr = &sk->sk_rcv_saddr; entry->daddr = &sk->sk_daddr; } } int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) { const struct nlattr *bc = cb_data->inet_diag_nla_bc; const struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; if (!bc) return 1; entry.family = READ_ONCE(sk->sk_family); entry_fill_addrs(&entry, sk); entry.sport = READ_ONCE(inet->inet_num); entry.dport = ntohs(READ_ONCE(inet->inet_dport)); entry.ifindex = READ_ONCE(sk->sk_bound_dev_if); if (cb_data->userlocks_needed) entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0; if (cb_data->mark_needed) { if (sk_fullsock(sk)) entry.mark = READ_ONCE(sk->sk_mark); else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else if (sk->sk_state == TCP_TIME_WAIT) entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; } #ifdef CONFIG_SOCK_CGROUP_DATA if (cb_data->cgroup_needed) entry.cgroup_id = sk_fullsock(sk) ? cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); } EXPORT_SYMBOL_GPL(inet_diag_bc_sk); static int valid_cc(const void *bc, int len, int cc) { while (len >= 0) { const struct inet_diag_bc_op *op = bc; if (cc > len) return 0; if (cc == len) return 1; if (op->yes < 4 || op->yes & 3) return 0; len -= op->yes; bc += op->yes; } return 0; } /* data is u32 ifindex */ static bool valid_devcond(const struct inet_diag_bc_op *op, int len, int *min_len) { /* Check ifindex space. */ *min_len += sizeof(u32); if (len < *min_len) return false; return true; } /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) { struct inet_diag_hostcond *cond; int addr_len; /* Check hostcond space. */ *min_len += sizeof(struct inet_diag_hostcond); if (len < *min_len) return false; cond = (struct inet_diag_hostcond *)(op + 1); /* Check address family and address length. */ switch (cond->family) { case AF_UNSPEC: addr_len = 0; break; case AF_INET: addr_len = sizeof(struct in_addr); break; case AF_INET6: addr_len = sizeof(struct in6_addr); break; default: return false; } *min_len += addr_len; if (len < *min_len) return false; /* Check prefix length (in bits) vs address length (in bytes). */ if (cond->prefix_len > 8 * addr_len) return false; return true; } /* Validate a port comparison operator. */ static bool valid_port_comparison(const struct inet_diag_bc_op *op, int len, int *min_len) { /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ *min_len += sizeof(struct inet_diag_bc_op); if (len < *min_len) return false; return true; } static bool valid_markcond(const struct inet_diag_bc_op *op, int len, int *min_len) { *min_len += sizeof(struct inet_diag_markcond); return len >= *min_len; } #ifdef CONFIG_SOCK_CGROUP_DATA static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, int *min_len) { *min_len += sizeof(u64); return len >= *min_len; } #endif static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data, const struct sk_buff *skb) { const struct nlattr *attr = cb_data->inet_diag_nla_bc; const void *bytecode, *bc; int bytecode_len, len; bool net_admin; if (!attr) return 0; if (nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); const struct inet_diag_bc_op *op = bc; switch (op->code) { case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_DEV_COND: if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_MARK_COND: if (!net_admin) return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; cb_data->mark_needed = true; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; cb_data->cgroup_needed = true; break; #endif case INET_DIAG_BC_AUTO: cb_data->userlocks_needed = true; fallthrough; case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; default: return -EINVAL; } if (op->code != INET_DIAG_BC_NOP) { if (op->no < min_len || op->no > len + 4 || op->no & 3) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len - op->no)) return -EINVAL; } if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; len -= op->yes; } return len == 0 ? 0 : -EINVAL; } static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; int protocol, err = 0; protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(protocol); if (handler) { handler->dump(skb, cb, r); inet_diag_unlock_handler(handler); } else { err = -ENOENT; } /* The skb is not large enough to fit one sk info and * inet_sk_diag_fill() has requested for a larger skb. */ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); if (!err) goto again; } return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); } static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) { const struct nlmsghdr *nlh = cb->nlh; struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); if (err) { kfree(cb_data); return err; } err = inet_diag_bc_audit(cb_data, skb); if (err) { kfree(cb_data); return err; } nla = cb_data->inet_diag_nla_bpf_stgs; if (nla) { struct bpf_sk_storage_diag *bpf_stg_diag; bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); if (IS_ERR(bpf_stg_diag)) { kfree(cb_data); return PTR_ERR(bpf_stg_diag); } cb_data->bpf_stg_diag = bpf_stg_diag; } cb->data = cb_data; return 0; } static int inet_diag_dump_start(struct netlink_callback *cb) { return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); } static int inet_diag_dump_start_compat(struct netlink_callback *cb) { return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); } static int inet_diag_dump_done(struct netlink_callback *cb) { struct inet_diag_dump_data *cb_data = cb->data; bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); kfree(cb->data); return 0; } static int inet_diag_type2proto(int type) { switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; default: return 0; } } static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); struct inet_diag_req_v2 req; req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; req.sdiag_family = rc->idiag_family; req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) { int hdrlen = sizeof(struct inet_diag_req); struct net *net = sock_net(skb->sk); if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || nlmsg_len(nlh) < hdrlen) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = inet_diag_dump_start_compat, .done = inet_diag_dump_done, .dump = inet_diag_dump_compat, }; return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); } static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = inet_diag_dump_start, .done = inet_diag_dump_done, .dump = inet_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, nlmsg_data(h)); } static int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) { const struct inet_diag_handler *handler; struct nlmsghdr *nlh; struct nlattr *attr; struct inet_diag_msg *r; void *info = NULL; int err = 0; nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); if (!nlh) return -ENOMEM; r = nlmsg_data(nlh); memset(r, 0, sizeof(*r)); inet_diag_msg_common_fill(r, sk); if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) r->id.idiag_sport = inet_sk(sk)->inet_sport; r->idiag_state = sk->sk_state; if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { nlmsg_cancel(skb, nlh); return err; } handler = inet_diag_lock_handler(sk->sk_protocol); if (!handler) { nlmsg_cancel(skb, nlh); return -ENOENT; } attr = handler->idiag_info_size ? nla_reserve_64bit(skb, INET_DIAG_INFO, handler->idiag_info_size, INET_DIAG_PAD) : NULL; if (attr) info = nla_data(attr); handler->idiag_get_info(sk, r, info); inet_diag_unlock_handler(handler); nlmsg_end(skb, nlh); return 0; } static const struct sock_diag_handler inet_diag_handler = { .owner = THIS_MODULE, .family = AF_INET, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, .destroy = inet_diag_handler_cmd, }; static const struct sock_diag_handler inet6_diag_handler = { .owner = THIS_MODULE, .family = AF_INET6, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, .destroy = inet_diag_handler_cmd, }; int inet_diag_register(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; if (type >= IPPROTO_MAX) return -EINVAL; return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type], NULL, h) ? 0 : -EEXIST; } EXPORT_SYMBOL_GPL(inet_diag_register); void inet_diag_unregister(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; if (type >= IPPROTO_MAX) return; xchg((const struct inet_diag_handler **)&inet_diag_table[type], NULL); } EXPORT_SYMBOL_GPL(inet_diag_unregister); static const struct sock_diag_inet_compat inet_diag_compat = { .owner = THIS_MODULE, .fn = inet_diag_rcv_msg_compat, }; static int __init inet_diag_init(void) { const int inet_diag_table_size = (IPPROTO_MAX * sizeof(struct inet_diag_handler *)); int err = -ENOMEM; inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL); if (!inet_diag_table) goto out; err = sock_diag_register(&inet_diag_handler); if (err) goto out_free_nl; err = sock_diag_register(&inet6_diag_handler); if (err) goto out_free_inet; sock_diag_register_inet_compat(&inet_diag_compat); out: return err; out_free_inet: sock_diag_unregister(&inet_diag_handler); out_free_nl: kfree(inet_diag_table); goto out; } static void __exit inet_diag_exit(void) { sock_diag_unregister(&inet6_diag_handler); sock_diag_unregister(&inet_diag_handler); sock_diag_unregister_inet_compat(&inet_diag_compat); kfree(inet_diag_table); } module_init(inet_diag_init); module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
3221 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/pagevec.h * * In many places it is efficient to batch an operation up against multiple * folios. A folio_batch is a container which is used for that. */ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H #include <linux/types.h> /* 31 pointers + header align the folio_batch structure to a power of two */ #define PAGEVEC_SIZE 31 struct folio; /** * struct folio_batch - A collection of folios. * * The folio_batch is used to amortise the cost of retrieving and * operating on a set of folios. The order of folios in the batch may be * significant (eg delete_from_page_cache_batch()). Some users of the * folio_batch store "exceptional" entries in it which can be removed * by calling folio_batch_remove_exceptionals(). */ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; /** * folio_batch_init() - Initialise a batch of folios * @fbatch: The folio batch. * * A freshly initialised folio_batch contains zero folios. */ static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; } static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } /** * folio_batch_add() - Add a folio to a batch. * @fbatch: The folio batch. * @folio: The folio to add. * * The folio is added to the end of the batch. * The batch must have previously been initialised using folio_batch_init(). * * Return: The number of slots still available. */ static inline unsigned folio_batch_add(struct folio_batch *fbatch, struct folio *folio) { fbatch->folios[fbatch->nr++] = folio; return folio_batch_space(fbatch); } /** * folio_batch_next - Return the next folio to process. * @fbatch: The folio batch being processed. * * Use this function to implement a queue of folios. * * Return: The next folio in the queue, or NULL if the queue is empty. */ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) { if (fbatch->i == fbatch->nr) return NULL; return fbatch->folios[fbatch->i++]; } void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) { if (folio_batch_count(fbatch)) __folio_batch_release(fbatch); } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); #endif /* _LINUX_PAGEVEC_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007-2008 BalaBit IT Ltd. * Author: Krisztian Kovacs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> #include <net/sock.h> #include <net/inet_sock.h> #include <net/netfilter/nf_socket.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif static int extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, __be32 *raddr, __be32 *laddr, __be16 *rport, __be16 *lport) { unsigned int outside_hdrlen = ip_hdrlen(skb); struct iphdr *inside_iph, _inside_iph; struct icmphdr *icmph, _icmph; __be16 *ports, _ports[2]; icmph = skb_header_pointer(skb, outside_hdrlen, sizeof(_icmph), &_icmph); if (icmph == NULL) return 1; if (!icmp_is_err(icmph->type)) return 1; inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), sizeof(_inside_iph), &_inside_iph); if (inside_iph == NULL) return 1; if (inside_iph->protocol != IPPROTO_TCP && inside_iph->protocol != IPPROTO_UDP) return 1; ports = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr) + (inside_iph->ihl << 2), sizeof(_ports), &_ports); if (ports == NULL) return 1; /* the inside IP packet is the one quoted from our side, thus * its saddr is the local address */ *protocol = inside_iph->protocol; *laddr = inside_iph->saddr; *lport = ports[0]; *raddr = inside_iph->daddr; *rport = ports[1]; return 0; } static struct sock * nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: return inet_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); } return NULL; } struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, const struct net_device *indev) { __be32 daddr, saddr; __be16 dport, sport; const struct iphdr *iph = ip_hdr(skb); struct sk_buff *data_skb = NULL; u8 protocol; #if IS_ENABLED(CONFIG_NF_CONNTRACK) enum ip_conntrack_info ctinfo; struct nf_conn const *ct; #endif int doff = 0; if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { struct tcphdr _hdr; struct udphdr *hp; hp = skb_header_pointer(skb, ip_hdrlen(skb), iph->protocol == IPPROTO_UDP ? sizeof(*hp) : sizeof(_hdr), &_hdr); if (hp == NULL) return NULL; protocol = iph->protocol; saddr = iph->saddr; sport = hp->source; daddr = iph->daddr; dport = hp->dest; data_skb = (struct sk_buff *)skb; doff = iph->protocol == IPPROTO_TCP ? ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, &sport, &dport)) return NULL; } else { return NULL; } #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Do the lookup with the original socket address in * case this is a reply packet of an established * SNAT-ted connection. */ ct = nf_ct_get(skb, &ctinfo); if (ct && ((iph->protocol != IPPROTO_ICMP && ctinfo == IP_CT_ESTABLISHED_REPLY) || (iph->protocol == IPPROTO_ICMP && ctinfo == IP_CT_RELATED_REPLY)) && (ct->status & IPS_SRC_NAT_DONE)) { daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; dport = (iph->protocol == IPPROTO_TCP) ? ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; } #endif return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, daddr, sport, dport, indev); } EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure");
12 12 12 12 12 12 12 12 12 12 12 12 12 12 119 119 119 119 28 29 23 6 6 28 28 29 29 29 29 29 29 29 29 29 29 31 32 33 33 32 33 31 19 13 32 32 13 32 32 32 10 12 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/sort.h> #include "include/apparmor.h" #include "include/cred.h" #include "include/label.h" #include "include/policy.h" #include "include/secid.h" /* * the aa_label represents the set of profiles confining an object * * Labels maintain a reference count to the set of pointers they reference * Labels are ref counted by * tasks and object via the security field/security context off the field * code - will take a ref count on a label if it needs the label * beyond what is possible with an rcu_read_lock. * profiles - each profile is a label * secids - a pinned secid will keep a refcount of the label it is * referencing * objects - inode, files, sockets, ... * * Labels are not ref counted by the label set, so they maybe removed and * freed when no longer in use. * */ #define PROXY_POISON 97 #define LABEL_POISON 100 static void free_proxy(struct aa_proxy *proxy) { if (proxy) { /* p->label will not updated any more as p is dead */ aa_put_label(rcu_dereference_protected(proxy->label, true)); memset(proxy, 0, sizeof(*proxy)); RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON); kfree(proxy); } } void aa_proxy_kref(struct kref *kref) { struct aa_proxy *proxy = container_of(kref, struct aa_proxy, count); free_proxy(proxy); } struct aa_proxy *aa_alloc_proxy(struct aa_label *label, gfp_t gfp) { struct aa_proxy *new; new = kzalloc(sizeof(struct aa_proxy), gfp); if (new) { kref_init(&new->count); rcu_assign_pointer(new->label, aa_get_label(label)); } return new; } /* requires profile list write lock held */ void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new) { struct aa_label *tmp; AA_BUG(!orig); AA_BUG(!new); lockdep_assert_held_write(&labels_set(orig)->lock); tmp = rcu_dereference_protected(orig->proxy->label, &labels_ns(orig)->lock); rcu_assign_pointer(orig->proxy->label, aa_get_label(new)); orig->flags |= FLAG_STALE; aa_put_label(tmp); } static void __proxy_share(struct aa_label *old, struct aa_label *new) { struct aa_proxy *proxy = new->proxy; new->proxy = aa_get_proxy(old->proxy); __aa_proxy_redirect(old, new); aa_put_proxy(proxy); } /** * ns_cmp - compare ns for label set ordering * @a: ns to compare (NOT NULL) * @b: ns to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int ns_cmp(struct aa_ns *a, struct aa_ns *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b) return 0; res = a->level - b->level; if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * profile_cmp - profile comparison for set ordering * @a: profile to compare (NOT NULL) * @b: profile to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int profile_cmp(struct aa_profile *a, struct aa_profile *b) { int res; AA_BUG(!a); AA_BUG(!b); AA_BUG(!a->ns); AA_BUG(!b->ns); AA_BUG(!a->base.hname); AA_BUG(!b->base.hname); if (a == b || a->base.hname == b->base.hname) return 0; res = ns_cmp(a->ns, b->ns); if (res) return res; return strcmp(a->base.hname, b->base.hname); } /** * vec_cmp - label comparison for set ordering * @a: aa_profile to compare (NOT NULL) * @an: length of @a * @b: aa_profile to compare (NOT NULL) * @bn: length of @b * * Returns: <0 if @a < @b * ==0 if @a == @b * >0 if @a > @b */ static int vec_cmp(struct aa_profile **a, int an, struct aa_profile **b, int bn) { int i; AA_BUG(!a); AA_BUG(!*a); AA_BUG(!b); AA_BUG(!*b); AA_BUG(an <= 0); AA_BUG(bn <= 0); for (i = 0; i < an && i < bn; i++) { int res = profile_cmp(a[i], b[i]); if (res != 0) return res; } return an - bn; } static bool vec_is_stale(struct aa_profile **vec, int n) { int i; AA_BUG(!vec); for (i = 0; i < n; i++) { if (profile_is_stale(vec[i])) return true; } return false; } static void accum_label_info(struct aa_label *new) { long u = FLAG_UNCONFINED; int i; AA_BUG(!new); /* size == 1 is a profile and flags must be set as part of creation */ if (new->size == 1) return; for (i = 0; i < new->size; i++) { u |= new->vec[i]->label.flags & (FLAG_DEBUG1 | FLAG_DEBUG2 | FLAG_STALE); if (!(u & new->vec[i]->label.flags & FLAG_UNCONFINED)) u &= ~FLAG_UNCONFINED; new->mediates |= new->vec[i]->label.mediates; } new->flags |= u; } static int sort_cmp(const void *a, const void *b) { return profile_cmp(*(struct aa_profile **)a, *(struct aa_profile **)b); } /* * assumes vec is sorted * Assumes @vec has null terminator at vec[n], and will null terminate * vec[n - dups] */ static inline int unique(struct aa_profile **vec, int n) { int i, pos, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); pos = 0; for (i = 1; i < n; i++) { int res = profile_cmp(vec[pos], vec[i]); AA_BUG(res > 0, "vec not sorted"); if (res == 0) { /* drop duplicate */ aa_put_profile(vec[i]); dups++; continue; } pos++; if (dups) vec[pos] = vec[i]; } AA_BUG(dups < 0); return dups; } /** * aa_vec_unique - canonical sort and unique a list of profiles * @n: number of refcounted profiles in the list (@n > 0) * @vec: list of profiles to sort and merge * @flags: null terminator flags of @vec * * Returns: the number of duplicates eliminated == references put * * If @flags & VEC_FLAG_TERMINATE @vec has null terminator at vec[n], and will * null terminate vec[n - dups] */ int aa_vec_unique(struct aa_profile **vec, int n, int flags) { int i, dups = 0; AA_BUG(n < 1); AA_BUG(!vec); /* vecs are usually small and inorder, have a fallback for larger */ if (n > 8) { sort(vec, n, sizeof(struct aa_profile *), sort_cmp, NULL); dups = unique(vec, n); goto out; } /* insertion sort + unique in one */ for (i = 1; i < n; i++) { struct aa_profile *tmp = vec[i]; int pos, j; for (pos = i - 1 - dups; pos >= 0; pos--) { int res = profile_cmp(vec[pos], tmp); if (res == 0) { /* drop duplicate entry */ aa_put_profile(tmp); dups++; goto continue_outer; } else if (res < 0) break; } /* pos is at entry < tmp, or index -1. Set to insert pos */ pos++; for (j = i - dups; j > pos; j--) vec[j] = vec[j - 1]; vec[pos] = tmp; continue_outer: ; } AA_BUG(dups < 0); out: if (flags & VEC_FLAG_TERMINATE) vec[n - dups] = NULL; return dups; } void aa_label_destroy(struct aa_label *label) { AA_BUG(!label); if (!label_isprofile(label)) { struct aa_profile *profile; struct label_it i; aa_put_str(label->hname); label_for_each(i, label, profile) { aa_put_profile(profile); label->vec[i.i] = (struct aa_profile *) (LABEL_POISON + (long) i.i); } } if (label->proxy) { if (rcu_dereference_protected(label->proxy->label, true) == label) rcu_assign_pointer(label->proxy->label, NULL); aa_put_proxy(label->proxy); } aa_free_secid(label->secid); label->proxy = (struct aa_proxy *) PROXY_POISON + 1; } void aa_label_free(struct aa_label *label) { if (!label) return; aa_label_destroy(label); kfree(label); } static void label_free_switch(struct aa_label *label) { if (label->flags & FLAG_NS_COUNT) aa_free_ns(labels_ns(label)); else if (label_isprofile(label)) aa_free_profile(labels_profile(label)); else aa_label_free(label); } static void label_free_rcu(struct rcu_head *head) { struct aa_label *label = container_of(head, struct aa_label, rcu); if (label->flags & FLAG_IN_TREE) (void) aa_label_remove(label); label_free_switch(label); } void aa_label_kref(struct kref *kref) { struct aa_label *label = container_of(kref, struct aa_label, count); struct aa_ns *ns = labels_ns(label); if (!ns) { /* never live, no rcu callback needed, just using the fn */ label_free_switch(label); return; } /* TODO: update labels_profile macro so it works here */ AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.profiles)); AA_BUG(label_isprofile(label) && on_list_rcu(&label->vec[0]->base.list)); /* TODO: if compound label and not stale add to reclaim cache */ call_rcu(&label->rcu, label_free_rcu); } static void label_free_or_put_new(struct aa_label *label, struct aa_label *new) { if (label != new) /* need to free directly to break circular ref with proxy */ aa_label_free(new); else aa_put_label(new); } bool aa_label_init(struct aa_label *label, int size, gfp_t gfp) { AA_BUG(!label); AA_BUG(size < 1); if (aa_alloc_secid(label, gfp) < 0) return false; label->size = size; /* doesn't include null */ label->vec[size] = NULL; /* null terminate */ kref_init(&label->count); RB_CLEAR_NODE(&label->node); return true; } /** * aa_label_alloc - allocate a label with a profile vector of @size length * @size: size of profile vector in the label * @proxy: proxy to use OR null if to allocate a new one * @gfp: memory allocation type * * Returns: new label * else NULL if failed */ struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp) { struct aa_label *new; AA_BUG(size < 1); /* + 1 for null terminator entry on vec */ new = kzalloc(struct_size(new, vec, size + 1), gfp); AA_DEBUG(DEBUG_LABEL, "%s (%p)\n", __func__, new); if (!new) goto fail; if (!aa_label_init(new, size, gfp)) goto fail; if (!proxy) { proxy = aa_alloc_proxy(new, gfp); if (!proxy) goto fail; } else aa_get_proxy(proxy); /* just set new's proxy, don't redirect proxy here if it was passed in*/ new->proxy = proxy; return new; fail: kfree(new); return NULL; } /** * label_cmp - label comparison for set ordering * @a: label to compare (NOT NULL) * @b: label to compare (NOT NULL) * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_cmp(struct aa_label *a, struct aa_label *b) { AA_BUG(!b); if (a == b) return 0; return vec_cmp(a->vec, a->size, b->vec, b->size); } /* helper fn for label_for_each_confined */ int aa_label_next_confined(struct aa_label *label, int i) { AA_BUG(!label); AA_BUG(i < 0); for (; i < label->size; i++) { if (!profile_unconfined(label->vec[i])) return i; } return i; } /** * __aa_label_next_not_in_set - return the next profile of @sub not in @set * @I: label iterator * @set: label to test against * @sub: label to if is subset of @set * * Returns: profile in @sub that is not in @set, with iterator set pos after * else NULL if @sub is a subset of @set */ struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub) { AA_BUG(!set); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > set->size); AA_BUG(!sub); AA_BUG(I->j < 0); AA_BUG(I->j > sub->size); while (I->j < sub->size && I->i < set->size) { int res = profile_cmp(sub->vec[I->j], set->vec[I->i]); if (res == 0) { (I->j)++; (I->i)++; } else if (res > 0) (I->i)++; else return sub->vec[(I->j)++]; } if (I->j < sub->size) return sub->vec[(I->j)++]; return NULL; } /** * aa_label_is_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * Returns: true if @sub is subset of @set * else false */ bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; return __aa_label_next_not_in_set(&i, set, sub) == NULL; } /** * aa_label_is_unconfined_subset - test if @sub is a subset of @set * @set: label to test against * @sub: label to test if is subset of @set * * This checks for subset but taking into account unconfined. IF * @sub contains an unconfined profile that does not have a matching * unconfined in @set then this will not cause the test to fail. * Conversely we don't care about an unconfined in @set that is not in * @sub * * Returns: true if @sub is special_subset of @set * else false */ bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub) { struct label_it i = { }; struct aa_profile *p; AA_BUG(!set); AA_BUG(!sub); if (sub == set) return true; do { p = __aa_label_next_not_in_set(&i, set, sub); if (p && !profile_unconfined(p)) break; } while (p); return p == NULL; } /** * __label_remove - remove @label from the label set * @label: label to remove * @new: label to redirect to * * Requires: labels_set(@label)->lock write_lock * Returns: true if the label was in the tree and removed */ static bool __label_remove(struct aa_label *label, struct aa_label *new) { struct aa_labelset *ls = labels_set(label); AA_BUG(!ls); AA_BUG(!label); lockdep_assert_held_write(&ls->lock); if (new) __aa_proxy_redirect(label, new); if (!label_is_stale(label)) __label_make_stale(label); if (label->flags & FLAG_IN_TREE) { rb_erase(&label->node, &ls->root); label->flags &= ~FLAG_IN_TREE; return true; } return false; } /** * __label_replace - replace @old with @new in label set * @old: label to remove from label set * @new: label to replace @old with * * Requires: labels_set(@old)->lock write_lock * valid ref count be held on @new * Returns: true if @old was in set and replaced by @new * * Note: current implementation requires label set be order in such a way * that @new directly replaces @old position in the set (ie. * using pointer comparison of the label address would not work) */ static bool __label_replace(struct aa_label *old, struct aa_label *new) { struct aa_labelset *ls = labels_set(old); AA_BUG(!ls); AA_BUG(!old); AA_BUG(!new); lockdep_assert_held_write(&ls->lock); AA_BUG(new->flags & FLAG_IN_TREE); if (!label_is_stale(old)) __label_make_stale(old); if (old->flags & FLAG_IN_TREE) { rb_replace_node(&old->node, &new->node, &ls->root); old->flags &= ~FLAG_IN_TREE; new->flags |= FLAG_IN_TREE; accum_label_info(new); return true; } return false; } /** * __label_insert - attempt to insert @l into a label set * @ls: set of labels to insert @l into (NOT NULL) * @label: new label to insert (NOT NULL) * @replace: whether insertion should replace existing entry that is not stale * * Requires: @ls->lock * caller to hold a valid ref on l * if @replace is true l has a preallocated proxy associated * Returns: @l if successful in inserting @l - with additional refcount * else ref counted equivalent label that is already in the set, * the else condition only happens if @replace is false */ static struct aa_label *__label_insert(struct aa_labelset *ls, struct aa_label *label, bool replace) { struct rb_node **new, *parent = NULL; AA_BUG(!ls); AA_BUG(!label); AA_BUG(labels_set(label) != ls); lockdep_assert_held_write(&ls->lock); AA_BUG(label->flags & FLAG_IN_TREE); /* Figure out where to put new node */ new = &ls->root.rb_node; while (*new) { struct aa_label *this = rb_entry(*new, struct aa_label, node); int result = label_cmp(label, this); parent = *new; if (result == 0) { /* !__aa_get_label means queued for destruction, * so replace in place, however the label has * died before the replacement so do not share * the proxy */ if (!replace && !label_is_stale(this)) { if (__aa_get_label(this)) return this; } else __proxy_share(this, label); AA_BUG(!__label_replace(this, label)); return aa_get_label(label); } else if (result < 0) new = &((*new)->rb_left); else /* (result > 0) */ new = &((*new)->rb_right); } /* Add new node and rebalance tree. */ rb_link_node(&label->node, parent, new); rb_insert_color(&label->node, &ls->root); label->flags |= FLAG_IN_TREE; accum_label_info(label); return aa_get_label(label); } /** * __vec_find - find label that matches @vec in label set * @vec: vec of profiles to find matching label for (NOT NULL) * @n: length of @vec * * Requires: @vec_labelset(vec) lock held * caller to hold a valid ref on l * * Returns: ref counted @label if matching label is in tree * ref counted label that is equiv to @l in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *__vec_find(struct aa_profile **vec, int n) { struct rb_node *node; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); node = vec_labelset(vec, n)->root.rb_node; while (node) { struct aa_label *this = rb_entry(node, struct aa_label, node); int result = vec_cmp(this->vec, this->size, vec, n); if (result > 0) node = node->rb_left; else if (result < 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * __label_find - find label @label in label set * @label: label to find (NOT NULL) * * Requires: labels_set(@label)->lock held * caller to hold a valid ref on l * * Returns: ref counted @label if @label is in tree OR * ref counted label that is equiv to @label in tree * else NULL if @label or equiv is not in tree */ static struct aa_label *__label_find(struct aa_label *label) { AA_BUG(!label); return __vec_find(label->vec, label->size); } /** * aa_label_remove - remove a label from the labelset * @label: label to remove * * Returns: true if @label was removed from the tree * else @label was not in tree so it could not be removed */ bool aa_label_remove(struct aa_label *label) { struct aa_labelset *ls = labels_set(label); unsigned long flags; bool res; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); res = __label_remove(label, ns_unconfined(labels_ns(label))); write_unlock_irqrestore(&ls->lock, flags); return res; } /** * aa_label_replace - replace a label @old with a new version @new * @old: label to replace * @new: label replacing @old * * Returns: true if @old was in tree and replaced * else @old was not in tree, and @new was not inserted */ bool aa_label_replace(struct aa_label *old, struct aa_label *new) { unsigned long flags; bool res; if (name_is_shared(old, new) && labels_ns(old) == labels_ns(new)) { write_lock_irqsave(&labels_set(old)->lock, flags); if (old->proxy != new->proxy) __proxy_share(old, new); else __aa_proxy_redirect(old, new); res = __label_replace(old, new); write_unlock_irqrestore(&labels_set(old)->lock, flags); } else { struct aa_label *l; struct aa_labelset *ls = labels_set(old); write_lock_irqsave(&ls->lock, flags); res = __label_remove(old, new); if (labels_ns(old) != labels_ns(new)) { write_unlock_irqrestore(&ls->lock, flags); ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); } l = __label_insert(ls, new, true); res = (l == new); write_unlock_irqrestore(&ls->lock, flags); aa_put_label(l); } return res; } /** * vec_find - find label @l in label set * @vec: array of profiles to find equiv label for (NOT NULL) * @n: length of @vec * * Returns: refcounted label if @vec equiv is in tree * else NULL if @vec equiv is not in tree */ static struct aa_label *vec_find(struct aa_profile **vec, int n) { struct aa_labelset *ls; struct aa_label *label; unsigned long flags; AA_BUG(!vec); AA_BUG(!*vec); AA_BUG(n <= 0); ls = vec_labelset(vec, n); read_lock_irqsave(&ls->lock, flags); label = __vec_find(vec, n); read_unlock_irqrestore(&ls->lock, flags); return label; } /* requires sort and merge done first */ static struct aa_label *vec_create_and_insert_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = NULL; struct aa_labelset *ls; unsigned long flags; struct aa_label *new; int i; AA_BUG(!vec); if (len == 1) return aa_get_label(&vec[0]->label); ls = labels_set(&vec[len - 1]->label); /* TODO: enable when read side is lockless * check if label exists before taking locks */ new = aa_label_alloc(len, NULL, gfp); if (!new) return NULL; for (i = 0; i < len; i++) new->vec[i] = aa_get_profile(vec[i]); write_lock_irqsave(&ls->lock, flags); label = __label_insert(ls, new, false); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(label, new); return label; } struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp) { struct aa_label *label = vec_find(vec, len); if (label) return label; return vec_create_and_insert_label(vec, len, gfp); } /** * aa_label_insert - insert label @label into @ls or return existing label * @ls: labelset to insert @label into * @label: label to insert * * Requires: caller to hold a valid ref on @label * * Returns: ref counted @label if successful in inserting @label * else ref counted equivalent label that is already in the set */ struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *label) { struct aa_label *l; unsigned long flags; AA_BUG(!ls); AA_BUG(!label); /* check if label exists before taking lock */ if (!label_is_stale(label)) { read_lock_irqsave(&ls->lock, flags); l = __label_find(label); read_unlock_irqrestore(&ls->lock, flags); if (l) return l; } write_lock_irqsave(&ls->lock, flags); l = __label_insert(ls, label, false); write_unlock_irqrestore(&ls->lock, flags); return l; } /** * aa_label_next_in_merge - find the next profile when merging @a and @b * @I: label iterator * @a: label to merge * @b: label to merge * * Returns: next profile * else null if no more profiles */ struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b) { AA_BUG(!a); AA_BUG(!b); AA_BUG(!I); AA_BUG(I->i < 0); AA_BUG(I->i > a->size); AA_BUG(I->j < 0); AA_BUG(I->j > b->size); if (I->i < a->size) { if (I->j < b->size) { int res = profile_cmp(a->vec[I->i], b->vec[I->j]); if (res > 0) return b->vec[(I->j)++]; if (res == 0) (I->j)++; } return a->vec[(I->i)++]; } if (I->j < b->size) return b->vec[(I->j)++]; return NULL; } /** * label_merge_cmp - cmp of @a merging with @b against @z for set ordering * @a: label to merge then compare (NOT NULL) * @b: label to merge then compare (NOT NULL) * @z: label to compare merge against (NOT NULL) * * Assumes: using the most recent versions of @a, @b, and @z * * Returns: <0 if a < b * ==0 if a == b * >0 if a > b */ static int label_merge_cmp(struct aa_label *a, struct aa_label *b, struct aa_label *z) { struct aa_profile *p = NULL; struct label_it i = { }; int k; AA_BUG(!a); AA_BUG(!b); AA_BUG(!z); for (k = 0; k < z->size && (p = aa_label_next_in_merge(&i, a, b)); k++) { int res = profile_cmp(p, z->vec[k]); if (res != 0) return res; } if (p) return 1; else if (k < z->size) return -1; return 0; } /** * label_merge_insert - create a new label by merging @a and @b * @new: preallocated label to merge into (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: preallocated proxy * * Returns: ref counted label either @new if merge is unique * @a if @b is a subset of @a * @b if @a is a subset of @b * * NOTE: will not use @new if the merge results in @new == @a or @b * * Must be used within labelset write lock to avoid racing with * setting labels stale. */ static struct aa_label *label_merge_insert(struct aa_label *new, struct aa_label *a, struct aa_label *b) { struct aa_label *label; struct aa_labelset *ls; struct aa_profile *next; struct label_it i; unsigned long flags; int k = 0, invcount = 0; bool stale = false; AA_BUG(!a); AA_BUG(a->size < 0); AA_BUG(!b); AA_BUG(b->size < 0); AA_BUG(!new); AA_BUG(new->size < a->size + b->size); label_for_each_in_merge(i, a, b, next) { AA_BUG(!next); if (profile_is_stale(next)) { new->vec[k] = aa_get_newest_profile(next); AA_BUG(!new->vec[k]->label.proxy); AA_BUG(!new->vec[k]->label.proxy->label); if (next->label.proxy != new->vec[k]->label.proxy) invcount++; k++; stale = true; } else new->vec[k++] = aa_get_profile(next); } /* set to actual size which is <= allocated len */ new->size = k; new->vec[k] = NULL; if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { label = aa_get_label(&new->vec[0]->label); return label; } } else if (!stale) { /* * merge could be same as a || b, note: it is not possible * for new->size == a->size == b->size unless a == b */ if (k == a->size) return aa_get_label(a); else if (k == b->size) return aa_get_label(b); } ls = labels_set(new); write_lock_irqsave(&ls->lock, flags); label = __label_insert(labels_set(new), new, false); write_unlock_irqrestore(&ls->lock, flags); return label; } /** * labelset_of_merge - find which labelset a merged label should be inserted * @a: label to merge and insert * @b: label to merge and insert * * Returns: labelset that the merged label should be inserted into */ static struct aa_labelset *labelset_of_merge(struct aa_label *a, struct aa_label *b) { struct aa_ns *nsa = labels_ns(a); struct aa_ns *nsb = labels_ns(b); if (ns_cmp(nsa, nsb) <= 0) return &nsa->labels; return &nsb->labels; } /** * __label_find_merge - find label that is equiv to merge of @a and @b * @ls: set of labels to search (NOT NULL) * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: ls->lock read_lock held * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ static struct aa_label *__label_find_merge(struct aa_labelset *ls, struct aa_label *a, struct aa_label *b) { struct rb_node *node; AA_BUG(!ls); AA_BUG(!a); AA_BUG(!b); if (a == b) return __label_find(a); node = ls->root.rb_node; while (node) { struct aa_label *this = container_of(node, struct aa_label, node); int result = label_merge_cmp(a, b, this); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return __aa_get_label(this); } return NULL; } /** * aa_label_find_merge - find label that is equiv to merge of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * * Requires: labels be fully constructed with a valid ns * * Returns: ref counted label that is equiv to merge of @a and @b * else NULL if merge of @a and @b is not in set */ struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b) { struct aa_labelset *ls; struct aa_label *label, *ar = NULL, *br = NULL; unsigned long flags; AA_BUG(!a); AA_BUG(!b); if (label_is_stale(a)) a = ar = aa_get_newest_label(a); if (label_is_stale(b)) b = br = aa_get_newest_label(b); ls = labelset_of_merge(a, b); read_lock_irqsave(&ls->lock, flags); label = __label_find_merge(ls, a, b); read_unlock_irqrestore(&ls->lock, flags); aa_put_label(ar); aa_put_label(br); return label; } /** * aa_label_merge - attempt to insert new merged label of @a and @b * @a: label to merge with @b (NOT NULL) * @b: label to merge with @a (NOT NULL) * @gfp: memory allocation type * * Requires: caller to hold valid refs on @a and @b * labels be fully constructed with a valid ns * * Returns: ref counted new label if successful in inserting merge of a & b * else ref counted equivalent label that is already in the set. * else NULL if could not create label (-ENOMEM) */ struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp) { struct aa_label *label = NULL; AA_BUG(!a); AA_BUG(!b); if (a == b) return aa_get_newest_label(a); /* TODO: enable when read side is lockless * check if label exists before taking locks if (!label_is_stale(a) && !label_is_stale(b)) label = aa_label_find_merge(a, b); */ if (!label) { struct aa_label *new; a = aa_get_newest_label(a); b = aa_get_newest_label(b); /* could use label_merge_len(a, b), but requires double * comparison for small savings */ new = aa_label_alloc(a->size + b->size, NULL, gfp); if (!new) goto out; label = label_merge_insert(new, a, b); label_free_or_put_new(label, new); out: aa_put_label(a); aa_put_label(b); } return label; } /* match a profile and its associated ns component if needed * Assumes visibility test has already been done. * If a subns profile is not to be matched should be prescreened with * visibility test. */ static inline aa_state_t match_component(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_profile *tp, aa_state_t state) { const char *ns_name; if (profile->ns == tp->ns) return aa_dfa_match(rules->policy->dfa, state, tp->base.hname); /* try matching with namespace name and then profile */ ns_name = aa_ns_name(profile->ns, tp->ns, true); state = aa_dfa_match_len(rules->policy->dfa, state, ":", 1); state = aa_dfa_match(rules->policy->dfa, state, ns_name); state = aa_dfa_match_len(rules->policy->dfa, state, ":", 1); return aa_dfa_match(rules->policy->dfa, state, tp->base.hname); } /** * label_compound_match - find perms for full compound label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @state: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: perms struct to set * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for A//&B//&C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_compound_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; /* find first subcomponent that is visible */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, state); if (!state) goto fail; goto next; } /* no component visible */ *perms = allperms; return 0; next: label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = aa_dfa_match(rules->policy->dfa, state, "//&"); state = match_component(profile, rules, tp, state); if (!state) goto fail; } *perms = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, perms); if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return state; } /** * label_components_match - find perms for all subcomponents of a label * @profile: profile to find perms for * @rules: ruleset to search * @label: label to check access permissions for * @start: state to start match in * @subns: whether to do permission checks on components in a subns * @request: permissions to request * @perms: an initialized perms struct to add accumulation to * * Returns: 0 on success else ERROR * * For the label A//&B//&C this does the perm match for each of A and B and C * @perms should be preinitialized with allperms OR a previous permission * check to be stacked. */ static int label_components_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t start, bool subns, u32 request, struct aa_perms *perms) { struct aa_profile *tp; struct label_it i; struct aa_perms tmp; aa_state_t state = 0; /* find first subcomponent to test */ label_for_each(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; goto next; } /* no subcomponents visible - no change in perms */ return 0; next: tmp = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); label_for_each_cont(i, label, tp) { if (!aa_ns_visible(profile->ns, tp->ns, subns)) continue; state = match_component(profile, rules, tp, start); if (!state) goto fail; tmp = *aa_lookup_perms(rules->policy, state); aa_apply_modes_to_perms(profile, &tmp); aa_perms_accum(perms, &tmp); } if ((perms->allow & request) != request) return -EACCES; return 0; fail: *perms = nullperms; return -EACCES; } /** * aa_label_match - do a multi-component label match * @profile: profile to match against (NOT NULL) * @rules: ruleset to search * @label: label to match (NOT NULL) * @state: state to start in * @subns: whether to match subns components * @request: permission request * @perms: Returns computed perms (NOT NULL) * * Returns: the state the match finished in, may be the none matching state */ int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms) { int error = label_compound_match(profile, rules, label, state, subns, request, perms); if (!error) return error; *perms = allperms; return label_components_match(profile, rules, label, state, subns, request, perms); } /** * aa_update_label_name - update a label to have a stored name * @ns: ns being viewed from (NOT NULL) * @label: label to update (NOT NULL) * @gfp: type of memory allocation * * Requires: labels_set(label) not locked in caller * * note: only updates the label name if it does not have a name already * and if it is in the labelset */ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) { struct aa_labelset *ls; unsigned long flags; char __counted *name; bool res = false; AA_BUG(!ns); AA_BUG(!label); if (label->hname || labels_ns(label) != ns) return res; if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) < 0) return res; ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); if (!label->hname && label->flags & FLAG_IN_TREE) { label->hname = name; res = true; } else aa_put_str(name); write_unlock_irqrestore(&ls->lock, flags); return res; } /* * cached label name is present and visible * @label->hname only exists if label is namespace hierarchical */ static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, int flags) { if (label->hname && (!ns || labels_ns(label) == ns) && !(flags & ~FLAG_SHOW_MODE)) return true; return false; } /* helper macro for snprint routines */ #define update_for_len(total, len, size, str) \ do { \ size_t ulen = len; \ \ AA_BUG(len < 0); \ total += ulen; \ ulen = min(ulen, size); \ size -= ulen; \ str += ulen; \ } while (0) /** * aa_profile_snxprint - print a profile name to a buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @view: namespace profile is being viewed from * @profile: profile to view (NOT NULL) * @flags: whether to include the mode string * @prev_ns: last ns printed when used in compound print * * Returns: size of name written or would be written if larger than * available buffer * * Note: will not print anything if the profile is not visible */ static int aa_profile_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_profile *profile, int flags, struct aa_ns **prev_ns) { const char *ns_name = NULL; AA_BUG(!str && size != 0); AA_BUG(!profile); if (!view) view = profiles_ns(profile); if (view != profile->ns && (!prev_ns || (*prev_ns != profile->ns))) { if (prev_ns) *prev_ns = profile->ns; ns_name = aa_ns_name(view, profile->ns, flags & FLAG_VIEW_SUBNS); if (ns_name == aa_hidden_ns_name) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", ns_name); } } if ((flags & FLAG_SHOW_MODE) && profile != profile->ns->unconfined) { const char *modestr = aa_profile_mode_names[profile->mode]; if (ns_name) return snprintf(str, size, ":%s:%s (%s)", ns_name, profile->base.hname, modestr); return snprintf(str, size, "%s (%s)", profile->base.hname, modestr); } if (ns_name) return snprintf(str, size, ":%s:%s", ns_name, profile->base.hname); return snprintf(str, size, "%s", profile->base.hname); } static const char *label_modename(struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct label_it i; int mode = -1, count = 0; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { count++; if (profile == profile->ns->unconfined) /* special case unconfined so stacks with * unconfined don't report as mixed. ie. * profile_foo//&:ns1:unconfined (mixed) */ continue; if (mode == -1) mode = profile->mode; else if (mode != profile->mode) return "mixed"; } } if (count == 0) return "-"; if (mode == -1) /* everything was unconfined */ mode = APPARMOR_UNCONFINED; return aa_profile_mode_names[mode]; } /* if any visible label is not unconfined the display_mode returns true */ static inline bool display_mode(struct aa_ns *ns, struct aa_label *label, int flags) { if ((flags & FLAG_SHOW_MODE)) { struct aa_profile *profile; struct label_it i; label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS) && profile != profile->ns->unconfined) return true; } /* only ns->unconfined in set of profiles in ns */ return false; } return false; } /** * aa_label_snxprint - print a label name to a string buffer * @str: buffer to write to. (MAY BE NULL if @size == 0) * @size: size of buffer * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: whether to include the mode string * * Returns: size of name written or would be written if larger than * available buffer * * Note: labels do not have to be strictly hierarchical to the ns as * objects may be shared across different namespaces and thus * pickup labeling from each ns. If a particular part of the * label is not visible it will just be excluded. And if none * of the label is visible "---" will be used. */ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, struct aa_label *label, int flags) { struct aa_profile *profile; struct aa_ns *prev_ns = NULL; struct label_it i; int count = 0, total = 0; ssize_t len; AA_BUG(!str && size != 0); AA_BUG(!label); if (DEBUG_ABS_ROOT && (flags & FLAG_ABS_ROOT)) { ns = root_ns; len = snprintf(str, size, "_"); update_for_len(total, len, size, str); } else if (!ns) { ns = labels_ns(label); } label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { if (count > 0) { len = snprintf(str, size, "//&"); update_for_len(total, len, size, str); } len = aa_profile_snxprint(str, size, ns, profile, flags & FLAG_VIEW_SUBNS, &prev_ns); update_for_len(total, len, size, str); count++; } } if (count == 0) { if (flags & FLAG_HIDDEN_UNCONFINED) return snprintf(str, size, "%s", "unconfined"); return snprintf(str, size, "%s", aa_hidden_ns_name); } /* count == 1 && ... is for backwards compat where the mode * is not displayed for 'unconfined' in the current ns */ if (display_mode(ns, label, flags)) { len = snprintf(str, size, " (%s)", label_modename(ns, label, flags)); update_for_len(total, len, size, str); } return total; } #undef update_for_len /** * aa_label_asxprint - allocate a string buffer and print label into it * @strp: Returns - the allocated buffer with the label name. (NOT NULL) * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = kmalloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } /** * aa_label_acntsxprint - allocate a __counted string buffer and print label * @strp: buffer to write to. * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed * @gfp: kernel memory allocation type * * Returns: size of name written or would be written if larger than * available buffer */ int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { int size; AA_BUG(!strp); AA_BUG(!label); size = aa_label_snxprint(NULL, 0, ns, label, flags); if (size < 0) return size; *strp = aa_str_alloc(size + 1, gfp); if (!*strp) return -ENOMEM; return aa_label_snxprint(*strp, size + 1, ns, label, flags); } void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { const char *str; char *name = NULL; int len; AA_BUG(!ab); AA_BUG(!label); if (!use_label_hname(ns, label, flags) || display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len < 0) { AA_DEBUG(DEBUG_LABEL, "label print error"); return; } str = name; } else { str = (char *) label->hname; len = strlen(str); } if (audit_string_contains_control(str, len)) audit_log_n_hex(ab, str, len); else audit_log_n_string(ab, str, len); kfree(name); } void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!f); AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG(DEBUG_LABEL, "label print error"); return; } seq_puts(f, str); kfree(str); } else if (display_mode(ns, label, flags)) seq_printf(f, "%s (%s)", label->hname, label_modename(ns, label, flags)); else seq_puts(f, label->hname); } void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp) { AA_BUG(!label); if (!use_label_hname(ns, label, flags)) { char *str; int len; len = aa_label_asxprint(&str, ns, label, flags, gfp); if (len < 0) { AA_DEBUG(DEBUG_LABEL, "label print error"); return; } pr_info("%s", str); kfree(str); } else if (display_mode(ns, label, flags)) pr_info("%s (%s)", label->hname, label_modename(ns, label, flags)); else pr_info("%s", label->hname); } void aa_label_printk(struct aa_label *label, gfp_t gfp) { struct aa_ns *ns = aa_get_current_ns(); aa_label_xprintk(ns, label, FLAG_VIEW_SUBNS, gfp); aa_put_ns(ns); } static int label_count_strn_entries(const char *str, size_t n) { const char *end = str + n; const char *split; int count = 1; AA_BUG(!str); for (split = aa_label_strn_split(str, end - str); split; split = aa_label_strn_split(str, end - str)) { count++; str = split + 3; } return count; } /* * ensure stacks with components like * :ns:A//&B * have :ns: applied to both 'A' and 'B' by making the lookup relative * to the base if the lookup specifies an ns, else making the stacked lookup * relative to the last embedded ns in the string. */ static struct aa_profile *fqlookupn_profile(struct aa_label *base, struct aa_label *currentbase, const char *str, size_t n) { const char *first = skipn_spaces(str, n); if (first && *first == ':') return aa_fqlookupn_profile(base, str, n); return aa_fqlookupn_profile(currentbase, str, n); } /** * aa_label_strn_parse - parse, validate and convert a text string to a label * @base: base label to use for lookups (NOT NULL) * @str: null terminated text string (NOT NULL) * @n: length of str to parse, will stop at \0 if encountered before n * @gfp: allocation type * @create: true if should create compound labels if they don't exist * @force_stack: true if should stack even if no leading & * * Returns: the matching refcounted label if present * else ERRPTR */ struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack) { DEFINE_VEC(profile, vec); struct aa_label *label, *currbase = base; int i, len, stack = 0, error; const char *end = str + n; const char *split; AA_BUG(!base); AA_BUG(!str); str = skipn_spaces(str, n); if (str == NULL || (DEBUG_ABS_ROOT && *str == '_' && base != &root_ns->unconfined->label)) return ERR_PTR(-EINVAL); len = label_count_strn_entries(str, end - str); if (*str == '&' || force_stack) { /* stack on top of base */ stack = base->size; len += stack; if (*str == '&') str++; } error = vec_setup(profile, vec, len, gfp); if (error) return ERR_PTR(error); for (i = 0; i < stack; i++) vec[i] = aa_get_profile(base->vec[i]); for (split = aa_label_strn_split(str, end - str), i = stack; split && i < len; i++) { vec[i] = fqlookupn_profile(base, currbase, str, split - str); if (!vec[i]) goto fail; /* * if component specified a new ns it becomes the new base * so that subsequent lookups are relative to it */ if (vec[i]->ns != labels_ns(currbase)) currbase = &vec[i]->label; str = split + 3; split = aa_label_strn_split(str, end - str); } /* last element doesn't have a split */ if (i < len) { vec[i] = fqlookupn_profile(base, currbase, str, end - str); if (!vec[i]) goto fail; } if (len == 1) /* no need to free vec as len < LOCAL_VEC_ENTRIES */ return &vec[0]->label; len -= aa_vec_unique(vec, len, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (len == 1) { label = aa_get_label(&vec[0]->label); goto out; } if (create) label = aa_vec_find_or_create_label(vec, len, gfp); else label = vec_find(vec, len); if (!label) goto fail; out: /* use adjusted len from after vec_unique, not original */ vec_cleanup(profile, vec, len); return label; fail: label = ERR_PTR(-ENOENT); goto out; } struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack) { return aa_label_strn_parse(base, str, strlen(str), gfp, create, force_stack); } /** * aa_labelset_destroy - remove all labels from the label set * @ls: label set to cleanup (NOT NULL) * * Labels that are removed from the set may still exist beyond the set * being destroyed depending on their reference counting */ void aa_labelset_destroy(struct aa_labelset *ls) { struct rb_node *node; unsigned long flags; AA_BUG(!ls); write_lock_irqsave(&ls->lock, flags); for (node = rb_first(&ls->root); node; node = rb_first(&ls->root)) { struct aa_label *this = rb_entry(node, struct aa_label, node); if (labels_ns(this) != root_ns) __label_remove(this, ns_unconfined(labels_ns(this)->parent)); else __label_remove(this, NULL); } write_unlock_irqrestore(&ls->lock, flags); } /* * @ls: labelset to init (NOT NULL) */ void aa_labelset_init(struct aa_labelset *ls) { AA_BUG(!ls); rwlock_init(&ls->lock); ls->root = RB_ROOT; } static struct aa_label *labelset_next_stale(struct aa_labelset *ls) { struct aa_label *label; struct rb_node *node; unsigned long flags; AA_BUG(!ls); read_lock_irqsave(&ls->lock, flags); __labelset_for_each(ls, node) { label = rb_entry(node, struct aa_label, node); if ((label_is_stale(label) || vec_is_stale(label->vec, label->size)) && __aa_get_label(label)) goto out; } label = NULL; out: read_unlock_irqrestore(&ls->lock, flags); return label; } /** * __label_update - insert updated version of @label into labelset * @label: the label to update/replace * * Returns: new label that is up to date * else NULL on failure * * Requires: @ns lock be held * * Note: worst case is the stale @label does not get updated and has * to be updated at a later time. */ static struct aa_label *__label_update(struct aa_label *label) { struct aa_label *new, *tmp; struct aa_labelset *ls; unsigned long flags; int i, invcount = 0; AA_BUG(!label); AA_BUG(!mutex_is_locked(&labels_ns(label)->lock)); new = aa_label_alloc(label->size, label->proxy, GFP_KERNEL); if (!new) return NULL; /* * while holding the ns_lock will stop profile replacement, removal, * and label updates, label merging and removal can be occurring */ ls = labels_set(label); write_lock_irqsave(&ls->lock, flags); for (i = 0; i < label->size; i++) { AA_BUG(!label->vec[i]); new->vec[i] = aa_get_newest_profile(label->vec[i]); AA_BUG(!new->vec[i]); AA_BUG(!new->vec[i]->label.proxy); AA_BUG(!new->vec[i]->label.proxy->label); if (new->vec[i]->label.proxy != label->vec[i]->label.proxy) invcount++; } /* updated stale label by being removed/renamed from labelset */ if (invcount) { new->size -= aa_vec_unique(&new->vec[0], new->size, VEC_FLAG_TERMINATE); /* TODO: deal with reference labels */ if (new->size == 1) { tmp = aa_get_label(&new->vec[0]->label); AA_BUG(tmp == label); goto remove; } if (labels_set(label) != labels_set(new)) { write_unlock_irqrestore(&ls->lock, flags); tmp = aa_label_insert(labels_set(new), new); write_lock_irqsave(&ls->lock, flags); goto remove; } } else AA_BUG(labels_ns(label) != labels_ns(new)); tmp = __label_insert(labels_set(label), new, true); remove: /* ensure label is removed, and redirected correctly */ __label_remove(label, tmp); write_unlock_irqrestore(&ls->lock, flags); label_free_or_put_new(tmp, new); return tmp; } /** * __labelset_update - update labels in @ns * @ns: namespace to update labels in (NOT NULL) * * Requires: @ns lock be held * * Walk the labelset ensuring that all labels are up to date and valid * Any label that has a stale component is marked stale and replaced and * by an updated version. * * If failures happen due to memory pressures then stale labels will * be left in place until the next pass. */ static void __labelset_update(struct aa_ns *ns) { struct aa_label *label; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); do { label = labelset_next_stale(&ns->labels); if (label) { struct aa_label *l = __label_update(label); aa_put_label(l); aa_put_label(label); } } while (label); } /** * __aa_labelset_update_subtree - update all labels with a stale component * @ns: ns to start update at (NOT NULL) * * Requires: @ns lock be held * * Invalidates labels based on @p in @ns and any children namespaces. */ void __aa_labelset_update_subtree(struct aa_ns *ns) { struct aa_ns *child; AA_BUG(!ns); AA_BUG(!mutex_is_locked(&ns->lock)); __labelset_update(ns); list_for_each_entry(child, &ns->sub_ns, base.list) { mutex_lock_nested(&child->lock, child->level); __aa_labelset_update_subtree(child); mutex_unlock(&child->lock); } }
8 8 8 8 8 8 9 10 10 10 5 5 5 18 17 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mempool.c * * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. * * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/writeback.h> #include "slab.h" #ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(mempool_t *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); int i; pr_err("BUG: mempool element poison mismatch\n"); pr_err("Mempool %p size %zu\n", pool, size); pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); for (i = start; i < end; i++) pr_cont("%x ", *(u8 *)(element + i)); pr_cont("%s\n", end < size ? "..." : ""); dump_stack(); } static void __check_element(mempool_t *pool, void *element, size_t size) { u8 *obj = element; size_t i; for (i = 0; i < size; i++) { u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; if (obj[i] != exp) { poison_error(pool, element, size, i); return; } } memset(obj, POISON_INUSE, size); } static void check_element(mempool_t *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->free == mempool_kfree) { __check_element(pool, element, (size_t)pool->pool_data); } else if (pool->free == mempool_free_slab) { __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_local_page((struct page *)element); __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); kunmap_local(addr); } } static void __poison_element(void *element, size_t size) { u8 *obj = element; memset(obj, POISON_FREE, size - 1); obj[size - 1] = POISON_END; } static void poison_element(mempool_t *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->alloc == mempool_kmalloc) { __poison_element(element, (size_t)pool->pool_data); } else if (pool->alloc == mempool_alloc_slab) { __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_local_page((struct page *)element); __poison_element(addr, 1UL << (PAGE_SHIFT + order)); kunmap_local(addr); } } #else /* CONFIG_SLUB_DEBUG_ON */ static inline void check_element(mempool_t *pool, void *element) { } static inline void poison_element(mempool_t *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); else if (pool->alloc == mempool_alloc_pages) return kasan_mempool_poison_pages(element, (unsigned long)pool->pool_data); return true; } static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); else if (pool->alloc == mempool_alloc_slab) kasan_mempool_unpoison_object(element, kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_mempool_unpoison_pages(element, (unsigned long)pool->pool_data); } static __always_inline void add_element(mempool_t *pool, void *element) { BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; } static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); kasan_unpoison_element(pool, element); check_element(pool, element); return element; } /** * mempool_exit - exit a mempool initialized with mempool_init() * @pool: pointer to the memory pool which was initialized with * mempool_init(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. * * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ void mempool_exit(mempool_t *pool) { while (pool->curr_nr) { void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); pool->elements = NULL; } EXPORT_SYMBOL(mempool_exit); /** * mempool_destroy - deallocate a memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ void mempool_destroy(mempool_t *pool) { if (unlikely(!pool)) return; mempool_exit(pool); kfree(pool); } EXPORT_SYMBOL(mempool_destroy); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; pool->pool_data = pool_data; pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); /* * max() used here to ensure storage for at least 1 element to support * zero minimum pool */ pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* * First pre-allocate the guaranteed number of buffers, * also pre-allocate 1 element for zero minimum pool. */ while (pool->curr_nr < max(1, pool->min_nr)) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); if (unlikely(!element)) { mempool_exit(pool); return -ENOMEM; } add_element(pool, element); } return 0; } EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). * * Return: %0 on success, negative error code otherwise. */ int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_init_noprof); /** * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * @gfp_mask: memory allocation flags * @node_id: numa node to allocate on * * this function creates and allocates a guaranteed size, preallocated * memory pool. The pool can be used from the mempool_alloc() and mempool_free() * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. * * Return: pointer to the created memory pool object or %NULL on error. */ mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { mempool_t *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, gfp_mask, node_id)) { kfree(pool); return NULL; } return pool; } EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. * * Return: %0 on success, negative error code otherwise. */ int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); } pool->min_nr = new_min_nr; goto out_unlock; } spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), GFP_KERNEL); if (!new_elements) return -ENOMEM; spin_lock_irqsave(&pool->lock, flags); if (unlikely(new_min_nr <= pool->min_nr)) { /* Raced, other resize will do our work */ spin_unlock_irqrestore(&pool->lock, flags); kfree(new_elements); goto out; } memcpy(new_elements, pool->elements, pool->curr_nr * sizeof(*new_elements)); kfree(pool->elements); pool->elements = new_elements; pool->min_nr = new_min_nr; while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { add_element(pool, element); } else { spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); /* Raced */ goto out; } } out_unlock: spin_unlock_irqrestore(&pool->lock, flags); out: return 0; } EXPORT_SYMBOL(mempool_resize); /** * mempool_alloc - allocate an element from a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * @gfp_mask: the usual allocation bitmask. * * this function only sleeps if the alloc_fn() function sleeps or * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) * Note: using __GFP_ZERO is not supported. * * Return: pointer to the allocated element or %NULL on error. */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) return element; spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(element); return element; } /* * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ if (gfp_temp != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } /* Let's wait for someone else to return an element to @pool */ init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irqrestore(&pool->lock, flags); /* * FIXME: this should be io_schedule(). The timeout is there as a * workaround for some DM problems in 2.6.18. */ io_schedule_timeout(5*HZ); finish_wait(&pool->wait, &wait); goto repeat_alloc; } EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements * belonging to a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * This function is similar to mempool_alloc, but it only attempts allocating * an element from the preallocated elements. It does not sleep and immediately * returns if no preallocated elements are available. * * Return: pointer to the allocated element or %NULL if no elements are * available. */ void *mempool_alloc_preallocated(mempool_t *pool) { void *element; unsigned long flags; spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(element); return element; } spin_unlock_irqrestore(&pool->lock, flags); return NULL; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** * mempool_free - return an element to the pool. * @element: pool element pointer. * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * this function only sleeps if the free_fn() function sleeps. */ void mempool_free(void *element, mempool_t *pool) { unsigned long flags; if (unlikely(element == NULL)) return; /* * Paired with the wmb in mempool_alloc(). The preceding read is * for @element and the following @pool->curr_nr. This ensures * that the visible value of @pool->curr_nr is from after the * allocation of @element. This is necessary for fringe cases * where @element was passed to this task without going through * barriers. * * For example, assume @p is %NULL at the beginning and one task * performs "p = mempool_alloc(...);" while another task is doing * "while (!p) cpu_relax(); mempool_free(p, ...);". This function * may end up using curr_nr value which is from before allocation * of @p without the following rmb. */ smp_rmb(); /* * For correctness, we need a test which is guaranteed to trigger * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr * without locking achieves that and refilling as soon as possible * is desirable. * * Because curr_nr visible here is always a value after the * allocation of @element, any task which decremented curr_nr below * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets * incremented to min_nr afterwards. If curr_nr gets incremented * to min_nr after the allocation of @element, the elements * allocated after that are subject to the same guarantee. * * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. */ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); if (wq_has_sleeper(&pool->wait)) wake_up(&pool->wait); return; } spin_unlock_irqrestore(&pool->lock, flags); } /* * Handle the min_nr = 0 edge case: * * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, * so waiters sleeping on pool->wait would never be woken by the * wake-up path of previous test. This explicit check ensures the * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. */ if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); if (wq_has_sleeper(&pool->wait)) wake_up(&pool->wait); return; } spin_unlock_irqrestore(&pool->lock, flags); } pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); /* * A commonly used alloc and free fn. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); /* * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory * specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void mempool_kfree(void *element, void *pool_data) { kfree(element); } EXPORT_SYMBOL(mempool_kfree); void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kvmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kvmalloc); void mempool_kvfree(void *element, void *pool_data) { kvfree(element); } EXPORT_SYMBOL(mempool_kvfree); /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); void mempool_free_pages(void *element, void *pool_data) { int order = (int)(long)pool_data; __free_pages(element, order); } EXPORT_SYMBOL(mempool_free_pages);
18824 18824 357 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PAGE_EXT_H #define __LINUX_PAGE_EXT_H #include <linux/types.h> #include <linux/mmzone.h> #include <linux/stacktrace.h> struct pglist_data; #ifdef CONFIG_PAGE_EXTENSION /** * struct page_ext_operations - per page_ext client operations * @offset: Offset to the client's data within page_ext. Offset is returned to * the client by page_ext_init. * @size: The size of the client data within page_ext. * @need: Function that returns true if client requires page_ext. * @init: (optional) Called to initialize client once page_exts are allocated. * @need_shared_flags: True when client is using shared page_ext->flags * field. * * Each Page Extension client must define page_ext_operations in * page_ext_ops array. */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); bool need_shared_flags; }; /* * The page_ext_flags users must set need_shared_flags to true. */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif }; /* * Page Extension can be considered as an extended mem_map. * A page_ext page is associated with every page descriptor. The * page_ext helps us add more information about the page. * All page_ext are allocated at boot or memory hotplug event, * then the page_ext for pfn always exists. */ struct page_ext { unsigned long flags; }; extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); static inline bool early_page_ext_enabled(void) { return early_page_ext; } #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) { /* * page_ext is allocated per memory section. Once we cross a * memory section, we have to fetch the new pointer. */ return next_pfn % PAGES_PER_SECTION; } #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) { return true; } #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) { return (void *)(page_ext) + ops->offset; } static inline struct page_ext *page_ext_next(struct page_ext *curr) { void *next = curr; next += page_ext_size; return next; } struct page_ext_iter { unsigned long index; unsigned long start_pfn; struct page_ext *page_ext; }; /** * page_ext_iter_begin() - Prepare for iterating through page extensions. * @iter: page extension iterator. * @pfn: PFN of the page we're interested in. * * Must be called with RCU read lock taken. * * Return: NULL if no page_ext exists for this page. */ static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, unsigned long pfn) { iter->index = 0; iter->start_pfn = pfn; iter->page_ext = page_ext_lookup(pfn); return iter->page_ext; } /** * page_ext_iter_next() - Get next page extension * @iter: page extension iterator. * * Must be called with RCU read lock taken. * * Return: NULL if no next page_ext exists. */ static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) { unsigned long pfn; if (WARN_ON_ONCE(!iter->page_ext)) return NULL; iter->index++; pfn = iter->start_pfn + iter->index; if (page_ext_iter_next_fast_possible(pfn)) iter->page_ext = page_ext_next(iter->page_ext); else iter->page_ext = page_ext_lookup(pfn); return iter->page_ext; } /** * page_ext_iter_get() - Get current page extension * @iter: page extension iterator. * * Return: NULL if no page_ext exists for this iterator. */ static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) { return iter->page_ext; } /** * for_each_page_ext(): iterate through page_ext objects. * @__page: the page we're interested in * @__pgcount: how many pages to iterate through * @__page_ext: struct page_ext pointer where the current page_ext * object is returned * @__iter: struct page_ext_iter object (defined in the stack) * * IMPORTANT: must be called with RCU read lock taken. */ #define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ __page_ext && __iter.index < __pgcount; \ __page_ext = page_ext_iter_next(&__iter)) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; static inline bool early_page_ext_enabled(void) { return false; } static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } static inline void page_ext_init(void) { } static inline void page_ext_init_flatmem_late(void) { } static inline void page_ext_init_flatmem(void) { } static inline struct page_ext *page_ext_get(const struct page *page) { return NULL; } static inline void page_ext_put(struct page_ext *page_ext) { } #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */
4 11 8 6 2 10 2 1 11 27 7 26 14 13 26 1 4 19 17 2 8 1 10 8 2 2 2 1 1 6 2 3 2 20 17 3 7 10 14 12 8 2 2 1 1 18 2 5 15 25 25 10 10 7 5 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 // SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface */ #include <linux/sysctl.h> #include <linux/bitmap.h> #include <linux/proc_fs.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/initrd.h> #include <linux/times.h> #include <linux/limits.h> #include <linux/syscalls.h> #include <linux/capability.h> #include "../lib/kstrtox.h" #include <linux/uaccess.h> #include <asm/processor.h> /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; #ifdef CONFIG_PROC_SYSCTL /** * enum sysctl_writes_mode - supported sysctl write modes * * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value * to be written, and multiple writes on the same sysctl file descriptor * will rewrite the sysctl value, regardless of file position. No warning * is issued when the initial position is not 0. * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is * not 0. * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at * file position 0 and the value must be fully contained in the buffer * sent to the write syscall. If dealing with strings respect the file * position, but restrict this to the max length of the buffer, anything * passed the max length will be ignored. Multiple writes will append * to the buffer. * * These write modes control how current file position affects the behavior of * updating sysctl values through the proc interface on each write. */ enum sysctl_writes_mode { SYSCTL_WRITES_LEGACY = -1, SYSCTL_WRITES_WARN = 0, SYSCTL_WRITES_STRICT = 1, }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SYSCTL */ /* * /proc/sys support */ #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, char *buffer, size_t *lenp, loff_t *ppos) { size_t len; char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; return 0; } if (write) { if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { /* Only continue writes not past the end of buffer. */ len = strlen(data); if (len > maxlen - 1) len = maxlen - 1; if (*ppos > len) return 0; len = *ppos; } else { /* Start writing from beginning of buffer. */ len = 0; } *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; } data[len] = 0; } else { len = strlen(data); if (len > maxlen) len = maxlen; if (*ppos > len) { *lenp = 0; return 0; } data += *ppos; len -= *ppos; if (len > *lenp) len = *lenp; if (len) memcpy(buffer, data, len); if (len < *lenp) { buffer[len] = '\n'; len++; } *lenp = len; *ppos += len; } return 0; } static void warn_sysctl_write(const struct ctl_table *table) { pr_warn_once("%s wrote to %s when file position was not 0!\n" "This will not be supported in the future. To silence this\n" "warning, set kernel.sysctl_writes_strict = -1\n", current->comm, table->procname); } /** * proc_first_pos_non_zero_ignore - check if first position is allowed * @ppos: file position * @table: the sysctl table * * Returns true if the first position is non-zero and the sysctl_writes_strict * mode indicates this is not allowed for numeric input types. String proc * handlers can ignore the return value. */ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, const struct ctl_table *table) { if (!*ppos) return false; switch (sysctl_writes_strict) { case SYSCTL_WRITES_STRICT: return true; case SYSCTL_WRITES_WARN: warn_sysctl_write(table); return false; default: return false; } } /** * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes a string from/to the user buffer. If the kernel * buffer provided is not large enough to hold the string, the * string is truncated. The copied string is %NULL-terminated. * If the string is being read by the user process, it is copied * and a newline '\n' is added. It is truncated if the buffer is * not large enough. * * Returns 0 on success. */ int proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } static void proc_skip_spaces(char **buf, size_t *size) { while (*size) { if (!isspace(**buf)) break; (*size)--; (*buf)++; } } static void proc_skip_char(char **buf, size_t *size, const char v) { while (*size) { if (**buf != v) break; (*size)--; (*buf)++; } } /** * strtoul_lenient - parse an ASCII formatted integer from a buffer and only * fail on overflow * * @cp: kernel buffer containing the string to parse * @endp: pointer to store the trailing characters * @base: the base to use * @res: where the parsed integer will be stored * * In case of success 0 is returned and @res will contain the parsed integer, * @endp will hold any trailing characters. * This function will fail the parse on overflow. If there wasn't an overflow * the function will defer the decision what characters count as invalid to the * caller. */ static int strtoul_lenient(const char *cp, char **endp, unsigned int base, unsigned long *res) { unsigned long long result; unsigned int rv; cp = _parse_integer_fixup_radix(cp, &base); rv = _parse_integer(cp, base, &result); if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) return -ERANGE; cp += rv; if (endp) *endp = (char *)cp; *res = (unsigned long)result; return 0; } #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer * * @buf: a kernel buffer * @size: size of the kernel buffer * @val: this is where the number will be stored * @neg: set to %TRUE if number is negative * @perm_tr: a vector which contains the allowed trailers * @perm_tr_len: size of the perm_tr vector * @tr: pointer to store the trailer character * * In case of success %0 is returned and @buf and @size are updated with * the amount of bytes read. If @tr is non-NULL and a trailing * character exists (size is non-zero after returning from this * function), @tr is updated with the trailing character. */ static int proc_get_long(char **buf, size_t *size, unsigned long *val, bool *neg, const char *perm_tr, unsigned perm_tr_len, char *tr) { char *p, tmp[TMPBUFLEN]; ssize_t len = *size; if (len <= 0) return -EINVAL; if (len > TMPBUFLEN - 1) len = TMPBUFLEN - 1; memcpy(tmp, *buf, len); tmp[len] = 0; p = tmp; if (*p == '-' && *size > 1) { *neg = true; p++; } else *neg = false; if (!isdigit(*p)) return -EINVAL; if (strtoul_lenient(p, &p, 0, val)) return -EINVAL; len = p - tmp; /* We don't know if the next char is whitespace thus we may accept * invalid integers (e.g. 1234...a) or two integers instead of one * (e.g. 123...1). So lets not allow such large numbers. */ if (len == TMPBUFLEN - 1) return -EINVAL; if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) return -EINVAL; if (tr && (len < *size)) *tr = *p; *buf += len; *size -= len; return 0; } /** * proc_put_long - converts an integer to a decimal ASCII formatted string * * @buf: the user buffer * @size: the size of the user buffer * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * * In case of success @buf and @size are updated with the amount of bytes * written. */ static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; sprintf(p, "%s%lu", neg ? "-" : "", val); len = strlen(tmp); if (len > *size) len = *size; memcpy(*buf, tmp, len); *size -= len; *buf += len; } #undef TMPBUFLEN static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { char **buffer = (char **)buf; **buffer = c; (*size)--; (*buffer)++; *buf = *buffer; } } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { if (*negp) { if (*lvalp > (unsigned long) INT_MAX + 1) return -EINVAL; WRITE_ONCE(*valp, -*lvalp); } else { if (*lvalp > (unsigned long) INT_MAX) return -EINVAL; WRITE_ONCE(*valp, *lvalp); } } else { int val = READ_ONCE(*valp); if (val < 0) { *negp = true; *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; } } return 0; } static int do_proc_douintvec_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { if (write) { if (*lvalp > UINT_MAX) return -EINVAL; WRITE_ONCE(*valp, *lvalp); } else { unsigned int val = READ_ONCE(*valp); *lvalp = (unsigned long)val; } return 0; } static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { int *i, vleft, first = 1, err = 0; size_t left; char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; if (!conv) conv = do_proc_dointvec_conv; if (write) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; p = buffer; } for (; left && vleft--; i++, first=0) { unsigned long lval; bool neg; if (write) { proc_skip_spaces(&p, &left); if (!left) break; err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) break; if (conv(&neg, &lval, i, 1, data)) { err = -EINVAL; break; } } else { if (conv(&neg, &lval, i, 0, data)) { err = -EINVAL; break; } if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err && left) proc_skip_spaces(&p, &left); if (write && first) return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; return err; } static int do_proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } static int do_proc_douintvec_w(unsigned int *tbl_data, const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { unsigned long lval; int err = 0; size_t left; bool neg; char *p = buffer; left = *lenp; if (proc_first_pos_non_zero_ignore(ppos, table)) goto bail_early; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; proc_skip_spaces(&p, &left); if (!left) { err = -EINVAL; goto out_free; } err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err || neg) { err = -EINVAL; goto out_free; } if (conv(&lval, tbl_data, 1, data)) { err = -EINVAL; goto out_free; } if (!err && left) proc_skip_spaces(&p, &left); out_free: if (err) return -EINVAL; return 0; /* This is in keeping with old __do_proc_dointvec() */ bail_early: *ppos += *lenp; return err; } static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { unsigned long lval; int err = 0; size_t left; left = *lenp; if (conv(&lval, tbl_data, 0, data)) { err = -EINVAL; goto out; } proc_put_long(&buffer, &left, lval, false); if (!left) goto out; proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; *ppos += *lenp; return err; } static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { unsigned int *i, vleft; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } i = (unsigned int *) tbl_data; vleft = table->maxlen / sizeof(*i); /* * Arrays are not supported, keep this simple. *Do not* add * support for them. */ if (vleft != 1) { *lenp = 0; return -EINVAL; } if (!conv) conv = do_proc_douintvec_conv; if (write) return do_proc_douintvec_w(i, table, buffer, lenp, ppos, conv, data); return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } int do_proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); } /** * proc_dobool - read/write a bool * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes one integer value from/to the user buffer, * treated as an ASCII string. * * table->data must point to a bool variable and table->maxlen must * be sizeof(bool). * * Returns 0 on success. */ int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; bool *data = table->data; int res, val; /* Do not support arrays yet. */ if (table->maxlen != sizeof(bool)) return -EINVAL; tmp = *table; tmp.maxlen = sizeof(val); tmp.data = &val; val = READ_ONCE(*data); res = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (res) return res; if (write) WRITE_ONCE(*data, val); return 0; } /** * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ int proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } /** * proc_douintvec - read a vector of unsigned integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ int proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); } /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value * @max: pointer to maximum allowable value * * The do_proc_dointvec_minmax_conv_param structure provides the * minimum and maximum values for doing range checking for those sysctl * parameters that use the proc_dointvec_minmax() handler. */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; }; static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; /* * If writing, first do so via a temporary local int so we can * bounds-check it before touching *valp. */ int *ip = write ? &tmp : valp; ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); if (ret) return ret; if (write) { if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; WRITE_ONCE(*valp, tmp); } return 0; } /** * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, &param); } /** * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure * @min: pointer to minimum allowable value * @max: pointer to maximum allowable value * * The do_proc_douintvec_minmax_conv_param structure provides the * minimum and maximum values for doing range checking for those sysctl * parameters that use the proc_douintvec_minmax() handler. */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; }; static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { int ret; unsigned int tmp; struct do_proc_douintvec_minmax_conv_param *param = data; /* write via temporary local uint for bounds-checking */ unsigned int *up = write ? &tmp : valp; ret = do_proc_douintvec_conv(lvalp, up, write, data); if (ret) return ret; if (write) { if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -ERANGE; WRITE_ONCE(*valp, tmp); } return 0; } /** * proc_douintvec_minmax - read a vector of unsigned ints with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer * values from/to the user buffer, treated as an ASCII string. Negative * strings are not allowed. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). There is a final sanity * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, .max = (unsigned int *) table->extra2, }; return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, &param); } /** * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars * values from/to the user buffer, treated as an ASCII string. Negative * strings are not allowed. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success or an error on write when the range check fails. */ int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; unsigned int min = 0, max = 255U, val; u8 *data = table->data; struct do_proc_douintvec_minmax_conv_param param = { .min = &min, .max = &max, }; int res; /* Do not support arrays yet. */ if (table->maxlen != sizeof(u8)) return -EINVAL; if (table->extra1) min = *(unsigned int *) table->extra1; if (table->extra2) max = *(unsigned int *) table->extra2; tmp = *table; tmp.maxlen = sizeof(val); tmp.data = &val; val = READ_ONCE(*data); res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, &param); if (res) return res; if (write) WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); static int __do_proc_doulongvec_minmax(void *data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } i = data; min = table->extra1; max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; if (write) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; p = buffer; } for (; left && vleft--; i++, first = 0) { unsigned long val; if (write) { bool neg; proc_skip_spaces(&p, &left); if (!left) break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err || neg) { err = -EINVAL; break; } val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; break; } WRITE_ONCE(*i, val); } else { val = convdiv * READ_ONCE(*i) / convmul; if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err) proc_skip_spaces(&p, &left); if (write && first) return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; return err; } static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long * values from/to the user buffer, treated as an ASCII string. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success. */ int proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long * values from/to the user buffer, treated as an ASCII string. The values * are treated as milliseconds, and converted to jiffies when they are stored. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { if (*lvalp > INT_MAX / HZ) return 1; if (*negp) WRITE_ONCE(*valp, -*lvalp * HZ); else WRITE_ONCE(*valp, *lvalp * HZ); } else { int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; } *lvalp = lval / HZ; } return 0; } static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) return 1; *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); } else { int val = *valp; unsigned long lval; if (val < 0) { *negp = true; lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_clock_t(lval); } return 0; } static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); if (jif > INT_MAX) return 1; WRITE_ONCE(*valp, (int)jif); } else { int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_msecs(lval); } return 0; } static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; /* * If writing, first do so via a temporary local int so we can * bounds-check it before touching *valp. */ int *ip = write ? &tmp : valp; ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); if (ret) return ret; if (write) { if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; *valp = tmp; } return 0; } /** * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in seconds, and are converted into * jiffies. * * Returns 0 on success. */ int proc_dointvec_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_minmax_conv, &param); } /** * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in 1/USER_HZ seconds, and * are converted into jiffies. * * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_userhz_jiffies_conv, NULL); } /** * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: the current position in the file * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in 1/1000 seconds, and * are converted into jiffies. * * Returns 0 on success. */ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } /** * proc_do_large_bitmap - read/write from/to a large bitmap * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * The bitmap is stored at table->data and the bitmap length (in bits) * in table->maxlen. * * We use a range comma separated format (e.g. 1,3-4,10-10) so that * large bitmaps may be represented in a compact manner. Writing into * the file will clear the bitmap then update it with the given input. * * Returns 0 on success. */ int proc_do_large_bitmap(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } if (write) { char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; /* How much of the buffer we'll skip this pass */ skipped = *lenp - left; } tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) return -ENOMEM; proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; size_t saved_left; /* In case we stop parsing mid-number, we can reset */ saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); /* * If we consumed the entirety of a truncated buffer or * only one char is left (may be a "-"), then stop here, * reset, & come back for more. */ if ((left <= 1) && skipped) { left = saved_left; break; } if (err) break; if (val_a >= bitmap_len || neg) { err = -EINVAL; break; } val_b = val_a; if (left) { p++; left--; } if (c == '-') { err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); /* * If we consumed all of a truncated buffer or * then stop here, reset, & come back for more. */ if (!left && skipped) { left = saved_left; break; } if (err) break; if (val_b >= bitmap_len || neg || val_a > val_b) { err = -EINVAL; break; } if (left) { p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); proc_skip_char(&p, &left, '\n'); } left += skipped; } else { unsigned long bit_a, bit_b = 0; bool first = 1; while (left) { bit_a = find_next_bit(bitmap, bitmap_len, bit_b); if (bit_a >= bitmap_len) break; bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; if (!first) proc_put_char(&buffer, &left, ','); proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { proc_put_char(&buffer, &left, '-'); proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } proc_put_char(&buffer, &left, '\n'); } if (!err) { if (write) { if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } *lenp -= left; *ppos += *lenp; } bitmap_free(tmp_bitmap); return err; } #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_do_large_bitmap(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } #endif /* CONFIG_PROC_SYSCTL */ #if defined(CONFIG_SYSCTL) int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); int val, ret; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&static_key_mutex); val = static_key_enabled(key); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (val) static_key_enable(key); else static_key_disable(key); } mutex_unlock(&static_key_mutex); return ret; } static const struct ctl_table sysctl_subsys_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "sysctl_writes_strict", .data = &sysctl_writes_strict, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif { .procname = "ngroups_max", .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "cap_last_cap", .data = (void *)&cap_last_cap, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { .procname = "unaligned-trap", .data = &unaligned_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; int __init sysctl_init_bases(void) { register_sysctl_init("kernel", sysctl_subsys_table); return 0; } #endif /* CONFIG_SYSCTL */ /* * No sense putting this after each symbol definition, twice, * exception granted :-) */ EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(proc_do_large_bitmap);
5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/list.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/srcu.h> #include <linux/rculist.h> #include <linux/wait.h> #include <linux/memcontrol.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #include <linux/atomic.h> /* * Final freeing of a group */ static void fsnotify_final_destroy_group(struct fsnotify_group *group) { if (group->ops->free_group_priv) group->ops->free_group_priv(group); mem_cgroup_put(group->memcg); mutex_destroy(&group->mark_mutex); kfree(group); } /* * Stop queueing new events for this group. Once this function returns * fsnotify_add_event() will not add any new events to the group's queue. */ void fsnotify_group_stop_queueing(struct fsnotify_group *group) { spin_lock(&group->notification_lock); group->shutdown = true; spin_unlock(&group->notification_lock); } /* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. * Note that another thread calling fsnotify_clear_marks_by_group() may still * hold a ref to the group. */ void fsnotify_destroy_group(struct fsnotify_group *group) { /* * Stop queueing new events. The code below is careful enough to not * require this but fanotify needs to stop queuing events even before * fsnotify_destroy_group() is called and this makes the other callers * of fsnotify_destroy_group() to see the same behavior. */ fsnotify_group_stop_queueing(group); /* Clear all marks for this group and queue them for destruction */ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_ANY); /* * Some marks can still be pinned when waiting for response from * userspace. Wait for those now. fsnotify_prepare_user_wait() will * not succeed now so this wait is race-free. */ wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); /* * Wait until all marks get really destroyed. We could actually destroy * them ourselves instead of waiting for worker to do it, however that * would be racy as worker can already be processing some marks before * we even entered fsnotify_destroy_group(). */ fsnotify_wait_marks_destroyed(); /* * Since we have waited for fsnotify_mark_srcu in * fsnotify_mark_destroy_list() there can be no outstanding event * notification against this group. So clearing the notification queue * of all events is reliable now. */ fsnotify_flush_notify(group); /* * Destroy overflow event (we cannot use fsnotify_destroy_event() as * that deliberately ignores overflow events. */ if (group->overflow_event) group->ops->free_event(group, group->overflow_event); fsnotify_put_group(group); } /* * Get reference to a group. */ void fsnotify_get_group(struct fsnotify_group *group) { refcount_inc(&group->refcnt); } /* * Drop a reference to a group. Free it if it's through. */ void fsnotify_put_group(struct fsnotify_group *group) { if (refcount_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } EXPORT_SYMBOL_GPL(fsnotify_put_group); static struct fsnotify_group *__fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags, gfp_t gfp) { struct fsnotify_group *group; group = kzalloc(sizeof(struct fsnotify_group), gfp); if (!group) return ERR_PTR(-ENOMEM); /* set to 0 when there a no external references to this group */ refcount_set(&group->refcnt, 1); atomic_set(&group->user_waits, 0); spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); init_waitqueue_head(&group->notification_waitq); group->max_events = UINT_MAX; mutex_init(&group->mark_mutex); INIT_LIST_HEAD(&group->marks_list); group->ops = ops; group->flags = flags; return group; } /* * Create a new fsnotify_group and hold a reference for the group returned. */ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops, int flags) { gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT : GFP_KERNEL; return __fsnotify_alloc_group(ops, flags, gfp); } EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { struct fsnotify_group *group = file->private_data; return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO; }
3 3 2 1 1 1 134 134 131 17 1 1 1 2 2 1 2 1 1 8 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> * Copyright 2025 Google LLC */ #include <crypto/internal/hash.h> #include <crypto/sha2.h> #include <linux/kernel.h> #include <linux/module.h> /* * Export and import functions. crypto_shash wants a particular format that * matches that used by some legacy drivers. It currently is the same as the * library SHA context, except the value in bytecount must be block-aligned and * the remainder must be stored in an extra u8 appended to the struct. */ #define SHA256_SHASH_STATE_SIZE 105 static_assert(offsetof(struct __sha256_ctx, state) == 0); static_assert(offsetof(struct __sha256_ctx, bytecount) == 32); static_assert(offsetof(struct __sha256_ctx, buf) == 40); static_assert(sizeof(struct __sha256_ctx) + 1 == SHA256_SHASH_STATE_SIZE); static int __crypto_sha256_export(const struct __sha256_ctx *ctx0, void *out) { struct __sha256_ctx ctx = *ctx0; unsigned int partial; u8 *p = out; partial = ctx.bytecount % SHA256_BLOCK_SIZE; ctx.bytecount -= partial; memcpy(p, &ctx, sizeof(ctx)); p += sizeof(ctx); *p = partial; return 0; } static int __crypto_sha256_import(struct __sha256_ctx *ctx, const void *in) { const u8 *p = in; memcpy(ctx, p, sizeof(*ctx)); p += sizeof(*ctx); ctx->bytecount += *p; return 0; } static int __crypto_sha256_export_core(const struct __sha256_ctx *ctx, void *out) { memcpy(out, ctx, offsetof(struct __sha256_ctx, buf)); return 0; } static int __crypto_sha256_import_core(struct __sha256_ctx *ctx, const void *in) { memcpy(ctx, in, offsetof(struct __sha256_ctx, buf)); return 0; } /* SHA-224 */ const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, 0xc5, 0xb3, 0xe4, 0x2f }; EXPORT_SYMBOL_GPL(sha224_zero_message_hash); #define SHA224_CTX(desc) ((struct sha224_ctx *)shash_desc_ctx(desc)) static int crypto_sha224_init(struct shash_desc *desc) { sha224_init(SHA224_CTX(desc)); return 0; } static int crypto_sha224_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha224_update(SHA224_CTX(desc), data, len); return 0; } static int crypto_sha224_final(struct shash_desc *desc, u8 *out) { sha224_final(SHA224_CTX(desc), out); return 0; } static int crypto_sha224_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha224(data, len, out); return 0; } static int crypto_sha224_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&SHA224_CTX(desc)->ctx, out); } static int crypto_sha224_import(struct shash_desc *desc, const void *in) { return __crypto_sha256_import(&SHA224_CTX(desc)->ctx, in); } static int crypto_sha224_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&SHA224_CTX(desc)->ctx, out); } static int crypto_sha224_import_core(struct shash_desc *desc, const void *in) { return __crypto_sha256_import_core(&SHA224_CTX(desc)->ctx, in); } /* SHA-256 */ const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55 }; EXPORT_SYMBOL_GPL(sha256_zero_message_hash); #define SHA256_CTX(desc) ((struct sha256_ctx *)shash_desc_ctx(desc)) static int crypto_sha256_init(struct shash_desc *desc) { sha256_init(SHA256_CTX(desc)); return 0; } static int crypto_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha256_update(SHA256_CTX(desc), data, len); return 0; } static int crypto_sha256_final(struct shash_desc *desc, u8 *out) { sha256_final(SHA256_CTX(desc), out); return 0; } static int crypto_sha256_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha256(data, len, out); return 0; } static int crypto_sha256_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&SHA256_CTX(desc)->ctx, out); } static int crypto_sha256_import(struct shash_desc *desc, const void *in) { return __crypto_sha256_import(&SHA256_CTX(desc)->ctx, in); } static int crypto_sha256_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&SHA256_CTX(desc)->ctx, out); } static int crypto_sha256_import_core(struct shash_desc *desc, const void *in) { return __crypto_sha256_import_core(&SHA256_CTX(desc)->ctx, in); } /* HMAC-SHA224 */ #define HMAC_SHA224_KEY(tfm) ((struct hmac_sha224_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA224_CTX(desc) ((struct hmac_sha224_ctx *)shash_desc_ctx(desc)) static int crypto_hmac_sha224_setkey(struct crypto_shash *tfm, const u8 *raw_key, unsigned int keylen) { hmac_sha224_preparekey(HMAC_SHA224_KEY(tfm), raw_key, keylen); return 0; } static int crypto_hmac_sha224_init(struct shash_desc *desc) { hmac_sha224_init(HMAC_SHA224_CTX(desc), HMAC_SHA224_KEY(desc->tfm)); return 0; } static int crypto_hmac_sha224_update(struct shash_desc *desc, const u8 *data, unsigned int len) { hmac_sha224_update(HMAC_SHA224_CTX(desc), data, len); return 0; } static int crypto_hmac_sha224_final(struct shash_desc *desc, u8 *out) { hmac_sha224_final(HMAC_SHA224_CTX(desc), out); return 0; } static int crypto_hmac_sha224_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { hmac_sha224(HMAC_SHA224_KEY(desc->tfm), data, len, out); return 0; } static int crypto_hmac_sha224_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha224_import(struct shash_desc *desc, const void *in) { struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc); ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import(&ctx->ctx.sha_ctx, in); } static int crypto_hmac_sha224_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&HMAC_SHA224_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha224_import_core(struct shash_desc *desc, const void *in) { struct hmac_sha224_ctx *ctx = HMAC_SHA224_CTX(desc); ctx->ctx.ostate = HMAC_SHA224_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in); } /* HMAC-SHA256 */ #define HMAC_SHA256_KEY(tfm) ((struct hmac_sha256_key *)crypto_shash_ctx(tfm)) #define HMAC_SHA256_CTX(desc) ((struct hmac_sha256_ctx *)shash_desc_ctx(desc)) static int crypto_hmac_sha256_setkey(struct crypto_shash *tfm, const u8 *raw_key, unsigned int keylen) { hmac_sha256_preparekey(HMAC_SHA256_KEY(tfm), raw_key, keylen); return 0; } static int crypto_hmac_sha256_init(struct shash_desc *desc) { hmac_sha256_init(HMAC_SHA256_CTX(desc), HMAC_SHA256_KEY(desc->tfm)); return 0; } static int crypto_hmac_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { hmac_sha256_update(HMAC_SHA256_CTX(desc), data, len); return 0; } static int crypto_hmac_sha256_final(struct shash_desc *desc, u8 *out) { hmac_sha256_final(HMAC_SHA256_CTX(desc), out); return 0; } static int crypto_hmac_sha256_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { hmac_sha256(HMAC_SHA256_KEY(desc->tfm), data, len, out); return 0; } static int crypto_hmac_sha256_export(struct shash_desc *desc, void *out) { return __crypto_sha256_export(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha256_import(struct shash_desc *desc, const void *in) { struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc); ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import(&ctx->ctx.sha_ctx, in); } static int crypto_hmac_sha256_export_core(struct shash_desc *desc, void *out) { return __crypto_sha256_export_core(&HMAC_SHA256_CTX(desc)->ctx.sha_ctx, out); } static int crypto_hmac_sha256_import_core(struct shash_desc *desc, const void *in) { struct hmac_sha256_ctx *ctx = HMAC_SHA256_CTX(desc); ctx->ctx.ostate = HMAC_SHA256_KEY(desc->tfm)->key.ostate; return __crypto_sha256_import_core(&ctx->ctx.sha_ctx, in); } /* Algorithm definitions */ static struct shash_alg algs[] = { { .base.cra_name = "sha224", .base.cra_driver_name = "sha224-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA224_DIGEST_SIZE, .init = crypto_sha224_init, .update = crypto_sha224_update, .final = crypto_sha224_final, .digest = crypto_sha224_digest, .export = crypto_sha224_export, .import = crypto_sha224_import, .export_core = crypto_sha224_export_core, .import_core = crypto_sha224_import_core, .descsize = sizeof(struct sha224_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "sha256", .base.cra_driver_name = "sha256-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .digestsize = SHA256_DIGEST_SIZE, .init = crypto_sha256_init, .update = crypto_sha256_update, .final = crypto_sha256_final, .digest = crypto_sha256_digest, .export = crypto_sha256_export, .import = crypto_sha256_import, .export_core = crypto_sha256_export_core, .import_core = crypto_sha256_import_core, .descsize = sizeof(struct sha256_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha224)", .base.cra_driver_name = "hmac-sha224-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA224_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct hmac_sha224_key), .base.cra_module = THIS_MODULE, .digestsize = SHA224_DIGEST_SIZE, .setkey = crypto_hmac_sha224_setkey, .init = crypto_hmac_sha224_init, .update = crypto_hmac_sha224_update, .final = crypto_hmac_sha224_final, .digest = crypto_hmac_sha224_digest, .export = crypto_hmac_sha224_export, .import = crypto_hmac_sha224_import, .export_core = crypto_hmac_sha224_export_core, .import_core = crypto_hmac_sha224_import_core, .descsize = sizeof(struct hmac_sha224_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, { .base.cra_name = "hmac(sha256)", .base.cra_driver_name = "hmac-sha256-lib", .base.cra_priority = 300, .base.cra_blocksize = SHA256_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct hmac_sha256_key), .base.cra_module = THIS_MODULE, .digestsize = SHA256_DIGEST_SIZE, .setkey = crypto_hmac_sha256_setkey, .init = crypto_hmac_sha256_init, .update = crypto_hmac_sha256_update, .final = crypto_hmac_sha256_final, .digest = crypto_hmac_sha256_digest, .export = crypto_hmac_sha256_export, .import = crypto_hmac_sha256_import, .export_core = crypto_hmac_sha256_export_core, .import_core = crypto_hmac_sha256_import_core, .descsize = sizeof(struct hmac_sha256_ctx), .statesize = SHA256_SHASH_STATE_SIZE, }, }; static int __init crypto_sha256_mod_init(void) { return crypto_register_shashes(algs, ARRAY_SIZE(algs)); } module_init(crypto_sha256_mod_init); static void __exit crypto_sha256_mod_exit(void) { crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } module_exit(crypto_sha256_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto API support for SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256"); MODULE_ALIAS_CRYPTO("sha224"); MODULE_ALIAS_CRYPTO("sha224-lib"); MODULE_ALIAS_CRYPTO("sha256"); MODULE_ALIAS_CRYPTO("sha256-lib"); MODULE_ALIAS_CRYPTO("hmac(sha224)"); MODULE_ALIAS_CRYPTO("hmac-sha224-lib"); MODULE_ALIAS_CRYPTO("hmac(sha256)"); MODULE_ALIAS_CRYPTO("hmac-sha256-lib");
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match running CPU */ /* * Might be used to distribute connections on several daemons, if * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable, * each RX queue IRQ affined to one CPU (1:1 mapping) */ /* (C) 2010 Eric Dumazet */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/xt_cpu.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); MODULE_DESCRIPTION("Xtables: CPU match"); MODULE_ALIAS("ipt_cpu"); MODULE_ALIAS("ip6t_cpu"); static int cpu_mt_check(const struct xt_mtchk_param *par) { const struct xt_cpu_info *info = par->matchinfo; if (info->invert & ~1) return -EINVAL; return 0; } static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cpu_info *info = par->matchinfo; return (info->cpu == smp_processor_id()) ^ info->invert; } static struct xt_match cpu_mt_reg __read_mostly = { .name = "cpu", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = cpu_mt_check, .match = cpu_mt, .matchsize = sizeof(struct xt_cpu_info), .me = THIS_MODULE, }; static int __init cpu_mt_init(void) { return xt_register_match(&cpu_mt_reg); } static void __exit cpu_mt_exit(void) { xt_unregister_match(&cpu_mt_reg); } module_init(cpu_mt_init); module_exit(cpu_mt_exit);
9 3 4 2 2 3 3 3 1 2 1 1 8 8 8 2780 2769 28 9 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* Copyright 2011, Siemens AG * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ /* Based on patches from Jon Smirl <jonsmirl@gmail.com> * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* Jon's code is based on 6lowpan implementation for Contiki which is: * Copyright (c) 2008, Swedish Institute of Computer Science. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the Institute nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <linux/if_arp.h> #include <net/ipv6.h> #include <net/netdev_lock.h> #include "6lowpan_i.h" static int open_count; static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; static int lowpan_dev_init(struct net_device *ldev) { netdev_lockdep_set_classes(ldev); return 0; } static int lowpan_open(struct net_device *dev) { if (!open_count) lowpan_rx_init(); open_count++; return 0; } static int lowpan_stop(struct net_device *dev) { open_count--; if (!open_count) lowpan_rx_exit(); return 0; } static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); /* default no short_addr is available for a neighbour */ neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); return 0; } static int lowpan_get_iflink(const struct net_device *dev) { return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex); } static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) { memset(ldev->broadcast, 0xff, IEEE802154_ADDR_LEN); /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; ldev->netns_immutable = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN) return -EINVAL; } return 0; } static int lowpan_newlink(struct net_device *ldev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **tb = params->tb; struct net_device *wdev; int ret; ASSERT_RTNL(); pr_debug("adding new link\n"); if (!tb[IFLA_LINK]) return -EINVAL; if (params->link_net && !net_eq(params->link_net, dev_net(ldev))) return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) return -ENODEV; if (wdev->type != ARPHRD_IEEE802154) { dev_put(wdev); return -EINVAL; } if (wdev->ieee802154_ptr->lowpan_dev) { dev_put(wdev); return -EBUSY; } lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN * header has LOWPAN_IPHC_MAX_HEADER_LEN more bytes than the IPv6 * header. */ ldev->needed_headroom = LOWPAN_IPHC_MAX_HEADER_LEN + wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); return ret; } wdev->ieee802154_ptr->lowpan_dev = ldev; return 0; } static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); wdev->ieee802154_ptr->lowpan_dev = NULL; lowpan_unregister_netdevice(ldev); dev_put(wdev); } static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, .validate = lowpan_validate, }; static inline int __init lowpan_netlink_init(void) { return rtnl_link_register(&lowpan_link_ops); } static inline void lowpan_netlink_fini(void) { rtnl_link_unregister(&lowpan_link_ops); } static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev; if (ndev->type != ARPHRD_IEEE802154) return NOTIFY_DONE; wpan_dev = ndev->ieee802154_ptr; if (!wpan_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: /* Check if wpan interface is unregistered that we * also delete possible lowpan interfaces which belongs * to the wpan interface. */ if (wpan_dev->lowpan_dev) lowpan_dellink(wpan_dev->lowpan_dev, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block lowpan_dev_notifier = { .notifier_call = lowpan_device_event, }; static int __init lowpan_init_module(void) { int err = 0; err = lowpan_net_frag_init(); if (err < 0) goto out; err = lowpan_netlink_init(); if (err < 0) goto out_frag; err = register_netdevice_notifier(&lowpan_dev_notifier); if (err < 0) goto out_pack; return 0; out_pack: lowpan_netlink_fini(); out_frag: lowpan_net_frag_exit(); out: return err; } static void __exit lowpan_cleanup_module(void) { lowpan_netlink_fini(); lowpan_net_frag_exit(); unregister_netdevice_notifier(&lowpan_dev_notifier); } module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan");
16756 16743 13652 13645 772 772 16 16 1496 16 16 1 16 451 8 279 280 451 449 17 451 451 446 5 451 449 17 451 451 451 449 13 17 16 451 450 451 451 16 450 732 733 731 725 689 50 705 704 8 705 8 22 22 705 704 8 706 706 22 255 696 8 696 702 705 274 275 153 274 1 1 703 700 46 47 703 610 1381 213 1471 1480 671 1472 179 1478 345 345 1476 1006 398 631 1480 984 24 771 768 736 223 222 222 24 24 23 24 767 768 152 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 // SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett */ #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/rhashtable.h> #include <linux/err.h> #include <linux/export.h> #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U union nested_table { union nested_table __rcu *table; struct rhash_lock_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { if (!debug_locks) return 1; if (unlikely(tbl->nest)) return 1; return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) #endif static inline union nested_table *nested_table_top( const struct bucket_table *tbl) { /* The top-level bucket entry does not need RCU protection * because it's set at the same time as tbl->nest. */ return (void *)rcu_dereference_protected(tbl->buckets[0], 1); } static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; ntbl = rcu_dereference_protected(ntbl->table, 1); if (!ntbl) return; if (size > len) { size >>= shift; for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); } kfree(ntbl); } static void nested_bucket_table_free(const struct bucket_table *tbl) { unsigned int size = tbl->size >> tbl->nest; unsigned int len = 1 << tbl->nest; union nested_table *ntbl; unsigned int i; ntbl = nested_table_top(tbl); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); kfree(ntbl); } static void bucket_table_free(const struct bucket_table *tbl) { if (tbl->nest) nested_bucket_table_free(tbl); kvfree(tbl); } static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); } static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, bool leaf) { union nested_table *ntbl; int i; ntbl = rcu_dereference(*prev); if (ntbl) return ntbl; ntbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); if (ntbl && leaf) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) INIT_RHT_NULLS_HEAD(ntbl[i].bucket); } if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) return ntbl; /* Raced with another thread. */ kfree(ntbl); return rcu_dereference(*prev); } static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); struct bucket_table *tbl; size_t size; if (nbuckets < (1 << (shift + 1))) return NULL; size = sizeof(*tbl) + sizeof(tbl->buckets[0]); tbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(size, gfp|__GFP_ZERO)); if (!tbl) return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, false)) { kfree(tbl); return NULL; } tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; return tbl; } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { struct bucket_table *tbl = NULL; size_t size; int i; static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } if (tbl == NULL) return NULL; lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); tbl->size = size; rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *new_tbl; do { new_tbl = tbl; tbl = rht_dereference_rcu(tbl->future_tbl, ht); } while (tbl); return new_tbl; } static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); int err = -EAGAIN; struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; unsigned long flags; if (new_tbl->nest) goto out; err = -ENOENT; rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); if (rht_is_a_nulls(next)) break; pprev = &entry->next; } if (err) goto out; new_hash = head_hashfn(ht, new_tbl, entry); flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); if (pprev) rcu_assign_pointer(*pprev, next); else /* Need to preserved the bit lock. */ rht_assign_locked(bkt, next); out: return err; } static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); unsigned long flags; int err; if (!bkt) return 0; flags = rht_lock(old_tbl, bkt); while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; if (err == -ENOENT) err = 0; rht_unlock(old_tbl, bkt, flags); return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. * As cmpxchg() provides strong barriers, we do not need * rcu_assign_pointer(). */ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, new_tbl) != NULL) return -EEXIST; return 0; } static int rhashtable_rehash_table(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; cond_resched(); } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. * We do this inside the locked region so that * rhashtable_walk_stop() can use rcu_head_after_call_rcu() * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } static int rhashtable_rehash_alloc(struct rhashtable *ht, struct bucket_table *old_tbl, unsigned int size) { struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) bucket_table_free(new_tbl); return err; } /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * * This function shrinks the hash table to fit, i.e., the smallest * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. * * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; if (old_tbl->size <= size) return 0; if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) err = rhashtable_shrink(ht); else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); if (!err || err == -EEXIST) { int nerr; nerr = rhashtable_rehash_table(ht); err = err ?: nerr; } mutex_unlock(&ht->mutex); if (err) schedule_work(&ht->run_work); } static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); size = tbl->size; err = -EBUSY; if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) goto fail; err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { bucket_table_free(new_tbl); if (err == -EEXIST) err = 0; } else schedule_work(&ht->run_work); return err; fail: /* Do not fail the insert if someone else did a rehash. */ if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) schedule_work(&ht->run_work); return err; } static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; int elasticity; elasticity = RHT_ELASTICITY; rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; elasticity--; if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } if (!ht->rhlist) return rht_obj(ht, head); list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) rcu_assign_pointer(*pprev, obj); else /* Need to preserve the bit lock */ rht_assign_locked(bkt, obj); return NULL; } if (elasticity <= 0) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); } static struct bucket_table *rhashtable_insert_one( struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, void *data) { struct bucket_table *new_tbl; struct rhash_head *head; if (!IS_ERR_OR_NULL(data)) return ERR_PTR(-EEXIST); if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (new_tbl) return new_tbl; if (PTR_ERR(data) != -ENOENT) return ERR_CAST(data); if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } /* bkt is always the head of the list, so it holds * the lock, which we need to preserve */ rht_assign_locked(bkt, obj); return NULL; } static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct rhash_head *obj) { struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; unsigned long flags; unsigned int hash; void *data; new_tbl = rcu_dereference(ht->tbl); do { tbl = new_tbl; hash = rht_head_hashfn(ht, tbl, obj, ht->p); if (rcu_access_pointer(tbl->future_tbl)) /* Failure is OK */ bkt = rht_bucket_var(tbl, hash); else bkt = rht_bucket_insert(ht, tbl, hash); if (bkt == NULL) { new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { bool inserted; flags = rht_lock(tbl, bkt); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, hash, obj, data); inserted = data && !new_tbl; if (inserted) atomic_inc(&ht->nelems); if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); rht_unlock(tbl, bkt, flags); if (inserted && rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); } } while (!IS_ERR_OR_NULL(new_tbl)); if (PTR_ERR(data) == -EAGAIN) data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: -EAGAIN); return data; } void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj) { void *data; do { rcu_read_lock(); data = rhashtable_try_insert(ht, key, obj); rcu_read_unlock(); } while (PTR_ERR(data) == -EAGAIN); return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptible context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; iter->end_of_table = 0; spin_lock(&ht->lock); iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator * @iter: Hash table Iterator * * This function frees resources allocated by rhashtable_walk_enter. */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); /** * rhashtable_walk_start_check - Start a hash table walk * @iter: Hash table iterator * * Start a hash table walk at the current iterator position. Note that we take * the RCU lock in all cases including when we return an error. So you must * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may use it immediately * by calling rhashtable_walk_next. * * rhashtable_walk_start is defined as an inline variant that returns * void. This is preferred in cases where the caller would ignore * resize events and always continue. */ int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU) { struct rhashtable *ht = iter->ht; bool rhlist = ht->rhlist; rcu_read_lock(); spin_lock(&ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&ht->lock); if (iter->end_of_table) return 0; if (!iter->walker.tbl) { iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); iter->slot = 0; iter->skip = 0; return -EAGAIN; } if (iter->p && !rhlist) { /* * We need to validate that 'p' is still in the table, and * if so, update 'skip' */ struct rhash_head *p; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { skip++; if (p == iter->p) { iter->skip = skip; goto found; } } iter->p = NULL; } else if (iter->p && rhlist) { /* Need to validate that 'list' is still in the table, and * if so, update 'skip' and 'p'. */ struct rhash_head *p; struct rhlist_head *list; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { for (list = container_of(p, struct rhlist_head, rhead); list; list = rcu_dereference(list->next)) { skip++; if (list == iter->list) { iter->p = p; iter->skip = skip; goto found; } } } iter->p = NULL; } found: return 0; } EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); /** * __rhashtable_walk_find_next - Find the next element in a table (or the first * one in case of a new walk). * * @iter: Hash table iterator * * Returns the found object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. */ static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (!tbl) return NULL; for (; iter->slot < tbl->size; iter->slot++) { int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { if (rhlist) { list = container_of(p, struct rhlist_head, rhead); do { if (!skip) goto next; skip--; list = rcu_dereference(list->next); } while (list); continue; } if (!skip) break; skip--; } next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; } iter->p = NULL; /* Ensure we see any new tables. */ smp_rmb(); iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); } else { iter->end_of_table = true; } return NULL; } /** * rhashtable_walk_next - Return the next object and advance the iterator * @iter: Hash table iterator * * Note that you must call rhashtable_walk_stop when you are finished * with the walk. * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (p) { if (!rhlist || !(list = rcu_dereference(list->next))) { p = rcu_dereference(p->next); list = container_of(p, struct rhlist_head, rhead); } if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } /* At the end of this slot, switch to next one and then find * next entry from that point. */ iter->skip = 0; iter->slot++; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_next); /** * rhashtable_walk_peek - Return the next object but don't advance the iterator * @iter: Hash table iterator * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_peek(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; if (p) return rht_obj(ht, ht->rhlist ? &list->rhead : p); /* No object found in current iter, find next one in the table. */ if (iter->skip) { /* A nonzero skip value points to the next entry in the table * beyond that last one that was found. Decrement skip so * we find the current value. __rhashtable_walk_find_next * will restore the original value of skip assuming that * the table hasn't changed. */ iter->skip--; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_peek); /** * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * * Finish a hash table walk. Does not reset the iterator to the start of the * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; ht = iter->ht; spin_lock(&ht->lock); if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) /* This bucket table is being freed, don't re-link it. */ iter->walker.tbl = NULL; else list_add(&iter->walker.list, &tbl->walkers); spin_unlock(&ht->lock); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { size_t retsize; if (params->nelem_hint) retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), (unsigned long)params->min_size); else retsize = max(HASH_DEFAULT_SIZE, (unsigned long)params->min_size); return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) { return jhash2(key, length, seed); } /** * rhashtable_init - initialize a new hash table * @ht: hash table to be initialized * @params: configuration parameters * * Initializes a new hash table based on the provided configuration * parameters. A table can be configured either with a variable or * fixed length key: * * Configuration Example 1: Fixed length keys * struct test_obj { * int key; * void * my_member; * struct rhash_head node; * }; * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, * }; * * Configuration Example 2: Variable length keys * struct test_obj { * [...] * struct rhash_head node; * }; * * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * * return [... hash ...]; * } * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, * }; */ int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); alloc_tag_record(ht->alloc_tag); if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); /* Cap total entries at 2^31 to avoid nelems overflow. */ ht->max_elems = 1u << 31; if (params->max_size) { ht->p.max_size = rounddown_pow_of_two(params->max_size); if (ht->p.max_size < ht->max_elems / 2) ht->max_elems = ht->p.max_size * 2; } ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { ht->p.hashfn = jhash; if (!(ht->key_len & (sizeof(u32) - 1))) { ht->key_len /= sizeof(u32); ht->p.hashfn = rhashtable_jhash2; } } /* * This is api initialization and thus we need to guarantee the * initial rhashtable allocation. Upon failure, retry with the * smallest possible size with __GFP_NOFAIL semantics. */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (unlikely(tbl == NULL)) { size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); } atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init_noprof); /** * rhltable_init - initialize a new hash list table * @hlt: hash list table to be initialized * @params: configuration parameters * * Initializes a new hash list table. * * See documentation for rhashtable_init. */ int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) { int err; err = rhashtable_init_noprof(&hlt->ht, params); hlt->ht.rhlist = true; return err; } EXPORT_SYMBOL_GPL(rhltable_init_noprof); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), void *arg) { struct rhlist_head *list; if (!ht->rhlist) { free_fn(rht_obj(ht, obj), arg); return; } list = container_of(obj, struct rhlist_head, rhead); do { obj = &list->rhead; list = rht_dereference(list->next, ht); free_fn(rht_obj(ht, obj), arg); } while (list); } /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * Stops an eventual async resize. If defined, invokes free_fn for each * element to releasal resources. Please note that RCU protected * readers may still be accessing the elements. Releasing of resources * must occur in a compatible manner. Then frees the bucket array. * * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { struct bucket_table *tbl, *next_tbl; unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); void rhashtable_destroy(struct rhashtable *ht) { return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; unsigned int subhash = hash; union nested_table *ntbl; ntbl = nested_table_top(tbl); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; while (ntbl && size > (1 << shift)) { index = subhash & ((1 << shift) - 1); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); size >>= shift; subhash >>= shift; } if (!ntbl) return NULL; return &ntbl[subhash].bucket; } EXPORT_SYMBOL_GPL(__rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { static struct rhash_lock_head __rcu *rhnull; if (!rhnull) INIT_RHT_NULLS_HEAD(rhnull); return __rht_bucket_nested(tbl, hash) ?: &rhnull; } EXPORT_SYMBOL_GPL(rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; ntbl = nested_table_top(tbl); hash >>= tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); size >>= shift; hash >>= shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); } if (!ntbl) return NULL; return &ntbl[hash].bucket; } EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ELEVATOR_H #define _ELEVATOR_H #include <linux/percpu.h> #include <linux/hashtable.h> #include "blk-mq.h" struct io_cq; struct elevator_type; struct blk_mq_debugfs_attr; /* * Return values from elevator merger */ enum elv_merge { ELEVATOR_NO_MERGE = 0, ELEVATOR_FRONT_MERGE = 1, ELEVATOR_BACK_MERGE = 2, ELEVATOR_DISCARD_MERGE = 3, }; struct blk_mq_alloc_data; struct blk_mq_hw_ctx; struct elevator_tags { /* num. of hardware queues for which tags are allocated */ unsigned int nr_hw_queues; /* depth used while allocating tags */ unsigned int nr_requests; /* shared tag is stored at index 0 */ struct blk_mq_tags *tags[]; }; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*depth_updated)(struct request_queue *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); int (*request_merge)(struct request_queue *q, struct request **, struct bio *); void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *); void (*prepare_request)(struct request *); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); void (*init_icq)(struct io_cq *); void (*exit_icq)(struct io_cq *); }; #define ELV_NAME_MAX (16) struct elv_fs_entry { struct attribute attr; ssize_t (*show)(struct elevator_queue *, char *); ssize_t (*store)(struct elevator_queue *, const char *, size_t); }; /* * identifies an elevator type, such as AS or deadline */ struct elevator_type { /* managed by elevator core */ struct kmem_cache *icq_cache; /* fields provided by elevator implementation */ struct elevator_mq_ops ops; size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ const struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; #endif /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */ struct list_head list; }; static inline bool elevator_tryget(struct elevator_type *e) { return try_module_get(e->elevator_owner); } static inline void __elevator_get(struct elevator_type *e) { __module_get(e->elevator_owner); } static inline void elevator_put(struct elevator_type *e) { module_put(e->elevator_owner); } #define ELV_HASH_BITS 6 void elv_rqhash_del(struct request_queue *q, struct request *rq); void elv_rqhash_add(struct request_queue *q, struct request *rq); void elv_rqhash_reposition(struct request_queue *q, struct request *rq); struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); /* * each queue has an elevator_queue associated with it */ struct elevator_queue { struct elevator_type *type; struct elevator_tags *et; void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; unsigned long flags; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; #define ELEVATOR_FLAG_REGISTERED 0 #define ELEVATOR_FLAG_DYING 1 #define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface */ extern enum elv_merge elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); void elevator_init_mq(struct request_queue *q); /* * io scheduler registration */ extern int elv_register(struct elevator_type *); extern void elv_unregister(struct elevator_type *); /* * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *, struct elevator_tags *); /* * Helper functions. */ extern struct request *elv_rb_former_request(struct request_queue *, struct request *); extern struct request *elv_rb_latter_request(struct request_queue *, struct request *); /* * rb support functions. */ extern void elv_rb_add(struct rb_root *, struct request *); extern void elv_rb_del(struct rb_root *, struct request *); extern struct request *elv_rb_find(struct rb_root *, sector_t); /* * Insertion selection */ #define ELEVATOR_INSERT_FRONT 1 #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 #define ELEVATOR_INSERT_REQUEUE 4 #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) void blk_mq_sched_reg_debugfs(struct request_queue *q); void blk_mq_sched_unreg_debugfs(struct request_queue *q); #endif /* _ELEVATOR_H */
33 1235 1233 1232 13 65 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. */ typedef u64 psched_time_t; typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); #define NET_SCH_ALIAS_PREFIX "net-sch-" #define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); qdisc_run_end(q); } } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. */ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware timestamps to the socket. */ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", txt, qdisc->ops->id, qdisc->handle >> 16); qdisc->flags |= TCQ_F_WARN_NONWC; } } static inline unsigned int qdisc_peek_len(struct Qdisc *sch) { struct sk_buff *skb; unsigned int len; skb = sch->ops->peek(sch); if (unlikely(skb == NULL)) { qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } len = qdisc_pkt_len(skb); return len; } #endif
6 2 3 3 3 87 86 3 112 4 6 4 6 113 117 4 4 119 112 6 6 1 1 1 1 15 16 14 29 24 15 16 16 29 28 142 14 128 5 136 141 8 154 154 154 27 62 68 94 61 2 2 93 2 136 75 8 130 136 94 42 128 135 92 50 134 149 37 3 3 2 1 1 3 2 1 1 3 3 136 54 146 49 152 7 1 139 54 146 49 160 158 2 2 2 2 2 3 3 3 3 3 3 3 3 3 2 97 29 29 29 2 3 3 3 3 1 2 150 151 66 66 67 67 162 161 1 14 150 150 149 149 162 151 12 162 162 162 162 49 2 126 4 124 2 135 49 153 2 7 7 7 155 151 3 159 2 7 19 144 75 137 9 2 2 118 60 157 2 3 243 243 213 128 128 129 130 153 125 119 34 96 60 50 61 27 17 27 119 118 30 26 3 27 27 32 88 27 27 26 1 1 72 67 15 23 119 119 118 119 4 115 105 29 119 75 75 75 34 36 66 4 4 4 5 5 6 6 6 6 48 48 48 66 4 4 5 6 48 66 58 6 29 33 49 1 46 2 32 16 31 17 43 2 3 44 4 46 2 46 2 44 4 46 2 45 3 44 4 40 5 3 39 5 4 45 3 43 5 42 2 4 42 6 47 3 3 50 49 1 47 48 48 48 48 61 2 60 61 61 1 61 1 1 1 1 1 1 1 4 5 3 11 11 2 2 2 2 2 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* COMMON Applications Kept Enhanced (CAKE) discipline * * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> * * The CAKE Principles: * (or, how to have your cake and eat it too) * * This is a combination of several shaping, AQM and FQ techniques into one * easy-to-use package: * * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), * eliminating the need for any sort of burst parameter (eg. token bucket * depth). Burst support is limited to that necessary to overcome scheduling * latency. * * - A Diffserv-aware priority queue, giving more priority to certain classes, * up to a specified fraction of bandwidth. Above that bandwidth threshold, * the priority is reduced to avoid starving other tins. * * - Each priority tin has a separate Flow Queue system, to isolate traffic * flows from each other. This prevents a burst on one flow from increasing * the delay to another. Flows are distributed to queues using a * set-associative hash function. * * - Each queue is actively managed by Cobalt, which is a combination of the * Codel and Blue AQM algorithms. This serves flows fairly, and signals * congestion early via ECN (if available) and/or packet drops, to keep * latency low. The codel parameters are auto-tuned based on the bandwidth * setting, as is necessary at low bandwidths. * * The configuration parameters are kept deliberately simple for ease of use. * Everything has sane defaults. Complete generality of configuration is *not* * a goal. * * The priority queue operates according to a weighted DRR scheme, combined with * a bandwidth tracker which reuses the shaper logic to detect which side of the * bandwidth sharing threshold the tin is operating. This determines whether a * priority-based weight (high) or a bandwidth-based weight (low) is used for * that tin in the current pass. * * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly * granted us permission to leverage. */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/reciprocal_div.h> #include <net/netlink.h> #include <linux/if_vlan.h> #include <net/gso.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tcp.h> #include <net/flow_dissector.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #endif #define CAKE_SET_WAYS (8) #define CAKE_MAX_TINS (8) #define CAKE_QUEUES (1024) #define CAKE_FLOW_MASK 63 #define CAKE_FLOW_NAT_FLAG 64 /* struct cobalt_params - contains codel and blue parameters * @interval: codel initial drop rate * @target: maximum persistent sojourn time & blue update rate * @mtu_time: serialisation delay of maximum-size packet * @p_inc: increment of blue drop probability (0.32 fxp) * @p_dec: decrement of blue drop probability (0.32 fxp) */ struct cobalt_params { u64 interval; u64 target; u64 mtu_time; u32 p_inc; u32 p_dec; }; /* struct cobalt_vars - contains codel and blue variables * @count: codel dropping frequency * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 * @drop_next: time to drop next packet, or when we dropped last * @blue_timer: Blue time to next drop * @p_drop: BLUE drop probability (0.32 fxp) * @dropping: set if in dropping state * @ecn_marked: set if marked */ struct cobalt_vars { u32 count; u32 rec_inv_sqrt; ktime_t drop_next; ktime_t blue_timer; u32 p_drop; bool dropping; bool ecn_marked; }; enum { CAKE_SET_NONE = 0, CAKE_SET_SPARSE, CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ CAKE_SET_BULK, CAKE_SET_DECAYING }; struct cake_flow { /* this stuff is all needed per-flow at dequeue time */ struct sk_buff *head; struct sk_buff *tail; struct list_head flowchain; s32 deficit; u32 dropped; struct cobalt_vars cvars; u16 srchost; /* index into cake_host table */ u16 dsthost; u8 set; }; /* please try to keep this structure <= 64 bytes */ struct cake_host { u32 srchost_tag; u32 dsthost_tag; u16 srchost_bulk_flow_count; u16 dsthost_bulk_flow_count; }; struct cake_heap_entry { u16 t:3, b:10; }; struct cake_tin_data { struct cake_flow flows[CAKE_QUEUES]; u32 backlogs[CAKE_QUEUES]; u32 tags[CAKE_QUEUES]; /* for set association */ u16 overflow_idx[CAKE_QUEUES]; struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ u16 flow_quantum; struct cobalt_params cparams; u32 drop_overlimit; u16 bulk_flow_count; u16 sparse_flow_count; u16 decaying_flow_count; u16 unresponsive_flow_count; u32 max_skblen; struct list_head new_flows; struct list_head old_flows; struct list_head decaying_flows; /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ ktime_t time_next_packet; u64 tin_rate_ns; u64 tin_rate_bps; u16 tin_rate_shft; u16 tin_quantum; s32 tin_deficit; u32 tin_backlog; u32 tin_dropped; u32 tin_ecn_mark; u32 packets; u64 bytes; u32 ack_drops; /* moving averages */ u64 avge_delay; u64 peak_delay; u64 base_delay; /* hash function stats */ u32 way_directs; u32 way_hits; u32 way_misses; u32 way_collisions; }; /* number of tins is small, so size of this struct doesn't matter much */ struct cake_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct tcf_block *block; struct cake_tin_data *tins; struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; u16 overflow_timeout; u16 tin_cnt; u8 tin_mode; u8 flow_mode; u8 ack_filter; u8 atm_mode; u32 fwmark_mask; u16 fwmark_shft; /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ u16 rate_shft; ktime_t time_next_packet; ktime_t failsafe_next_packet; u64 rate_ns; u64 rate_bps; u16 rate_flags; s16 rate_overhead; u16 rate_mpu; u64 interval; u64 target; /* resource tracking */ u32 buffer_used; u32 buffer_max_used; u32 buffer_limit; u32 buffer_config_limit; /* indices for dequeue */ u16 cur_tin; u16 cur_flow; struct qdisc_watchdog watchdog; const u8 *tin_index; const u8 *tin_order; /* bandwidth capacity estimate */ ktime_t last_packet_time; ktime_t avg_window_begin; u64 avg_packet_interval; u64 avg_window_bytes; u64 avg_peak_bandwidth; ktime_t last_reconfig_time; /* packet length stats */ u32 avg_netoff; u16 max_netlen; u16 max_adjlen; u16 min_netlen; u16 min_adjlen; }; enum { CAKE_FLAG_OVERHEAD = BIT(0), CAKE_FLAG_AUTORATE_INGRESS = BIT(1), CAKE_FLAG_INGRESS = BIT(2), CAKE_FLAG_WASH = BIT(3), CAKE_FLAG_SPLIT_GSO = BIT(4) }; /* COBALT operates the Codel and BLUE algorithms in parallel, in order to * obtain the best features of each. Codel is excellent on flows which * respond to congestion signals in a TCP-like way. BLUE is more effective on * unresponsive flows. */ struct cobalt_skb_cb { ktime_t enqueue_time; u32 adjusted_len; }; static u64 us_to_ns(u64 us) { return us * NSEC_PER_USEC; } static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; } static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) { return get_cobalt_cb(skb)->enqueue_time; } static void cobalt_set_enqueue_time(struct sk_buff *skb, ktime_t now) { get_cobalt_cb(skb)->enqueue_time = now; } static u16 quantum_div[CAKE_QUEUES + 1] = {0}; /* Diffserv lookup tables */ static const u8 precedence[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, }; static const u8 diffserv8[] = { 2, 0, 1, 2, 4, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 5, 2, 4, 2, 4, 2, 4, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6, 2, 3, 2, 3, 2, 3, 2, 6, 2, 2, 2, 6, 2, 6, 2, 7, 2, 2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 2, }; static const u8 diffserv4[] = { 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 3, 0, 2, 0, 2, 0, 2, 0, 3, 0, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, }; static const u8 diffserv3[] = { 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, }; static const u8 besteffort[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* tin priority order for stats dumping */ static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; static const u8 bulk_order[] = {1, 0, 2, 3}; /* There is a big difference in timing between the accurate values placed in the * cache and the approximations given by a single Newton step for small count * values, particularly when stepping from count 1 to 2 or vice versa. Hence, * these values are calculated using eight Newton steps, using the * implementation below. Above 16, a single Newton step gives sufficient * accuracy in either direction, given the precision stored. * * The magnitude of the error when stepping up to count 2 is such as to give the * value that *should* have been produced at count 4. */ #define REC_INV_SQRT_CACHE (16) static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { ~0, ~0, 3037000500, 2479700525, 2147483647, 1920767767, 1753413056, 1623345051, 1518500250, 1431655765, 1358187914, 1294981364, 1239850263, 1191209601, 1147878294, 1108955788 }; /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 */ static void cobalt_newton_step(struct cobalt_vars *vars) { u32 invsqrt, invsqrt2; u64 val; invsqrt = vars->rec_inv_sqrt; invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; val = (3LL << 32) - ((u64)vars->count * invsqrt2); val >>= 2; /* avoid overflow in following multiply */ val = (val * invsqrt) >> (32 - 2 + 1); vars->rec_inv_sqrt = val; } static void cobalt_invsqrt(struct cobalt_vars *vars) { if (vars->count < REC_INV_SQRT_CACHE) vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; else cobalt_newton_step(vars); } static void cobalt_vars_init(struct cobalt_vars *vars) { memset(vars, 0, sizeof(*vars)); } /* CoDel control_law is t + interval/sqrt(count) * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid * both sqrt() and divide operation. */ static ktime_t cobalt_control(ktime_t t, u64 interval, u32 rec_inv_sqrt) { return ktime_add_ns(t, reciprocal_scale(interval, rec_inv_sqrt)); } /* Call this when a packet had to be dropped due to queue overflow. Returns * true if the BLUE state was quiescent before but active after this call. */ static bool cobalt_queue_full(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now) { bool up = false; if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { up = !vars->p_drop; vars->p_drop += p->p_inc; if (vars->p_drop < p->p_inc) vars->p_drop = ~0; vars->blue_timer = now; } vars->dropping = true; vars->drop_next = now; if (!vars->count) vars->count = 1; return up; } /* Call this when the queue was serviced but turned out to be empty. Returns * true if the BLUE state was active before but quiescent after this call. */ static bool cobalt_queue_empty(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now) { bool down = false; if (vars->p_drop && ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { if (vars->p_drop < p->p_dec) vars->p_drop = 0; else vars->p_drop -= p->p_dec; vars->blue_timer = now; down = !vars->p_drop; } vars->dropping = false; if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { vars->count--; cobalt_invsqrt(vars); vars->drop_next = cobalt_control(vars->drop_next, p->interval, vars->rec_inv_sqrt); } return down; } /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. */ static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now, struct sk_buff *skb, u32 bulk_flows) { enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; bool next_due, over_target; ktime_t schedule; u64 sojourn; /* The 'schedule' variable records, in its sign, whether 'now' is before or * after 'drop_next'. This allows 'drop_next' to be updated before the next * scheduling decision is actually branched, without destroying that * information. Similarly, the first 'schedule' value calculated is preserved * in the boolean 'next_due'. * * As for 'drop_next', we take advantage of the fact that 'interval' is both * the delay between first exceeding 'target' and the first signalling event, * *and* the scaling factor for the signalling frequency. It's therefore very * natural to use a single mechanism for both purposes, and eliminates a * significant amount of reference Codel's spaghetti code. To help with this, * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close * as possible to 1.0 in fixed-point. */ sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); schedule = ktime_sub(now, vars->drop_next); over_target = sojourn > p->target && sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; next_due = vars->count && ktime_to_ns(schedule) >= 0; vars->ecn_marked = false; if (over_target) { if (!vars->dropping) { vars->dropping = true; vars->drop_next = cobalt_control(now, p->interval, vars->rec_inv_sqrt); } if (!vars->count) vars->count = 1; } else if (vars->dropping) { vars->dropping = false; } if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) reason = SKB_DROP_REASON_QDISC_CONGESTED; vars->count++; if (!vars->count) vars->count--; cobalt_invsqrt(vars); vars->drop_next = cobalt_control(vars->drop_next, p->interval, vars->rec_inv_sqrt); schedule = ktime_sub(now, vars->drop_next); } else { while (next_due) { vars->count--; cobalt_invsqrt(vars); vars->drop_next = cobalt_control(vars->drop_next, p->interval, vars->rec_inv_sqrt); schedule = ktime_sub(now, vars->drop_next); next_due = vars->count && ktime_to_ns(schedule) >= 0; } } /* Simple BLUE implementation. Lack of ECN is deliberate. */ if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && get_random_u32() < vars->p_drop) reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; return reason; } static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; bool rev = !skb->_nfct, upd = false; __be32 ip; if (skb_protocol(skb, true) != htons(ETH_P_IP)) return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) return false; ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; if (ip != keys->addrs.v4addrs.src) { keys->addrs.v4addrs.src = ip; upd = true; } ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; if (ip != keys->addrs.v4addrs.dst) { keys->addrs.v4addrs.dst = ip; upd = true; } if (keys->ports.ports) { __be16 port; port = rev ? tuple.dst.u.all : tuple.src.u.all; if (port != keys->ports.src) { keys->ports.src = port; upd = true; } port = rev ? tuple.src.u.all : tuple.dst.u.all; if (port != keys->ports.dst) { port = keys->ports.dst; upd = true; } } return upd; #else return false; #endif } /* Cake has several subtle multiple bit settings. In these cases you * would be matching triple isolate mode as well. */ static bool cake_dsrc(int flow_mode) { return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; } static bool cake_ddst(int flow_mode) { return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_dsrc(flow_mode) && q->hosts[flow->srchost].srchost_bulk_flow_count)) q->hosts[flow->srchost].srchost_bulk_flow_count--; } static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_dsrc(flow_mode) && q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) q->hosts[flow->srchost].srchost_bulk_flow_count++; } static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_ddst(flow_mode) && q->hosts[flow->dsthost].dsthost_bulk_flow_count)) q->hosts[flow->dsthost].dsthost_bulk_flow_count--; } static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { if (likely(cake_ddst(flow_mode) && q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) q->hosts[flow->dsthost].dsthost_bulk_flow_count++; } static u16 cake_get_flow_quantum(struct cake_tin_data *q, struct cake_flow *flow, int flow_mode) { u16 host_load = 1; if (cake_dsrc(flow_mode)) host_load = max(host_load, q->hosts[flow->srchost].srchost_bulk_flow_count); if (cake_ddst(flow_mode)) host_load = max(host_load, q->hosts[flow->dsthost].dsthost_bulk_flow_count); /* The get_random_u16() is a way to apply dithering to avoid * accumulating roundoff errors */ return (q->flow_quantum * quantum_div[host_load] + get_random_u16()) >> 16; } static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; /* If both overrides are set, or we can use the SKB hash and nat mode is * disabled, we can skip packet dissection entirely. If nat mode is * enabled there's another check below after doing the conntrack lookup. */ if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); /* Don't use the SKB hash if we change the lookup keys from conntrack */ if (nat_enabled && cake_update_flowkeys(&keys, skb)) use_skbhash = false; /* If we can still use the SKB hash and don't need the host hash, we can * skip the rest of the hashing procedure */ if (use_skbhash && !hash_hosts) goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat * src and dst host addresses as independently selectable. */ host_keys = keys; host_keys.ports.ports = 0; host_keys.basic.ip_proto = 0; host_keys.keyid.keyid = 0; host_keys.tags.flow_label = 0; switch (host_keys.control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: host_keys.addrs.v4addrs.src = 0; dsthost_hash = flow_hash_from_keys(&host_keys); host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; host_keys.addrs.v4addrs.dst = 0; srchost_hash = flow_hash_from_keys(&host_keys); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: memset(&host_keys.addrs.v6addrs.src, 0, sizeof(host_keys.addrs.v6addrs.src)); dsthost_hash = flow_hash_from_keys(&host_keys); host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; memset(&host_keys.addrs.v6addrs.dst, 0, sizeof(host_keys.addrs.v6addrs.dst)); srchost_hash = flow_hash_from_keys(&host_keys); break; default: dsthost_hash = 0; srchost_hash = 0; } /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; } if (!(flow_mode & CAKE_FLOW_FLOWS)) { if (flow_mode & CAKE_FLOW_SRC_IP) flow_hash ^= srchost_hash; if (flow_mode & CAKE_FLOW_DST_IP) flow_hash ^= dsthost_hash; } reduced_hash = flow_hash % CAKE_QUEUES; /* set-associative hashing */ /* fast path if no hash collision (direct lookup succeeds) */ if (likely(q->tags[reduced_hash] == flow_hash && q->flows[reduced_hash].set)) { q->way_directs++; } else { u32 inner_hash = reduced_hash % CAKE_SET_WAYS; u32 outer_hash = reduced_hash - inner_hash; bool allocate_src = false; bool allocate_dst = false; u32 i, k; /* check if any active queue in the set is reserved for * this flow. */ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->tags[outer_hash + k] == flow_hash) { if (i) q->way_hits++; if (!q->flows[outer_hash + k].set) { /* need to increment host refcnts */ allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); } goto found; } } /* no queue is reserved for this flow, look for an * empty one. */ for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->flows[outer_hash + k].set) { q->way_misses++; allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); goto found; } } /* With no empty queues, default to the original * queue, accept the collision, update the host tags. */ q->way_collisions++; allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ reduced_hash = outer_hash + k; q->tags[reduced_hash] = flow_hash; if (allocate_src) { srchost_idx = srchost_hash % CAKE_QUEUES; inner_hash = srchost_idx % CAKE_SET_WAYS; outer_hash = srchost_idx - inner_hash; for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->hosts[outer_hash + k].srchost_tag == srchost_hash) goto found_src; } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) break; } q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; q->flows[reduced_hash].srchost = srchost_idx; if (q->flows[reduced_hash].set == CAKE_SET_BULK) cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { dsthost_idx = dsthost_hash % CAKE_QUEUES; inner_hash = dsthost_idx % CAKE_SET_WAYS; outer_hash = dsthost_idx - inner_hash; for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->hosts[outer_hash + k].dsthost_tag == dsthost_hash) goto found_dst; } for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) break; } q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; q->flows[reduced_hash].dsthost = dsthost_idx; if (q->flows[reduced_hash].set == CAKE_SET_BULK) cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } return reduced_hash; } /* helper functions : might be changed when/if skb use a standard list_head */ /* remove one skb from head of slot queue */ static struct sk_buff *dequeue_head(struct cake_flow *flow) { struct sk_buff *skb = flow->head; if (skb) { flow->head = skb->next; skb_mark_not_on_list(skb); } return skb; } /* add skb to flow queue (tail add) */ static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { if (!flow->head) flow->head = skb; else flow->tail->next = skb; flow->tail = skb; skb->next = NULL; } static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, struct ipv6hdr *buf) { unsigned int offset = skb_network_offset(skb); struct iphdr *iph; iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); if (!iph) return NULL; if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) return skb_header_pointer(skb, offset + iph->ihl * 4, sizeof(struct ipv6hdr), buf); else if (iph->version == 4) return iph; else if (iph->version == 6) return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), buf); return NULL; } static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, void *buf, unsigned int bufsize) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct tcphdr *tcph; const struct iphdr *iph; struct ipv6hdr _ipv6h; struct tcphdr _tcph; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return NULL; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return NULL; offset += sizeof(struct ipv6hdr); } else if (iph->protocol != IPPROTO_TCP) { return NULL; } } else if (ipv6h->version == 6) { if (ipv6h->nexthdr != IPPROTO_TCP) return NULL; offset += sizeof(struct ipv6hdr); } else { return NULL; } tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (!tcph || tcph->doff < 5) return NULL; return skb_header_pointer(skb, offset, min(__tcp_hdrlen(tcph), bufsize), buf); } static const void *cake_get_tcpopt(const struct tcphdr *tcph, int code, int *oplen) { /* inspired by tcp_parse_options in tcp_input.c */ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); const u8 *ptr = (const u8 *)(tcph + 1); while (length > 0) { int opcode = *ptr++; int opsize; if (opcode == TCPOPT_EOL) break; if (opcode == TCPOPT_NOP) { length--; continue; } if (length < 2) break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; if (opcode == code) { *oplen = opsize; return ptr; } ptr += opsize - 2; length -= opsize; } return NULL; } /* Compare two SACK sequences. A sequence is considered greater if it SACKs more * bytes than the other. In the case where both sequences ACKs bytes that the * other doesn't, A is considered greater. DSACKs in A also makes A be * considered greater. * * @return -1, 0 or 1 as normal compare functions */ static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, const struct tcphdr *tcph_b) { const struct tcp_sack_block_wire *sack_a, *sack_b; u32 ack_seq_a = ntohl(tcph_a->ack_seq); u32 bytes_a = 0, bytes_b = 0; int oplen_a, oplen_b; bool first = true; sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); /* pointers point to option contents */ oplen_a -= TCPOLEN_SACK_BASE; oplen_b -= TCPOLEN_SACK_BASE; if (sack_a && oplen_a >= sizeof(*sack_a) && (!sack_b || oplen_b < sizeof(*sack_b))) return -1; else if (sack_b && oplen_b >= sizeof(*sack_b) && (!sack_a || oplen_a < sizeof(*sack_a))) return 1; else if ((!sack_a || oplen_a < sizeof(*sack_a)) && (!sack_b || oplen_b < sizeof(*sack_b))) return 0; while (oplen_a >= sizeof(*sack_a)) { const struct tcp_sack_block_wire *sack_tmp = sack_b; u32 start_a = get_unaligned_be32(&sack_a->start_seq); u32 end_a = get_unaligned_be32(&sack_a->end_seq); int oplen_tmp = oplen_b; bool found = false; /* DSACK; always considered greater to prevent dropping */ if (before(start_a, ack_seq_a)) return -1; bytes_a += end_a - start_a; while (oplen_tmp >= sizeof(*sack_tmp)) { u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); /* first time through we count the total size */ if (first) bytes_b += end_b - start_b; if (!after(start_b, start_a) && !before(end_b, end_a)) { found = true; if (!first) break; } oplen_tmp -= sizeof(*sack_tmp); sack_tmp++; } if (!found) return -1; oplen_a -= sizeof(*sack_a); sack_a++; first = false; } /* If we made it this far, all ranges SACKed by A are covered by B, so * either the SACKs are equal, or B SACKs more bytes. */ return bytes_b > bytes_a ? 1 : 0; } static void cake_tcph_get_tstamp(const struct tcphdr *tcph, u32 *tsval, u32 *tsecr) { const u8 *ptr; int opsize; ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); if (ptr && opsize == TCPOLEN_TIMESTAMP) { *tsval = get_unaligned_be32(ptr); *tsecr = get_unaligned_be32(ptr + 4); } } static bool cake_tcph_may_drop(const struct tcphdr *tcph, u32 tstamp_new, u32 tsecr_new) { /* inspired by tcp_parse_options in tcp_input.c */ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); const u8 *ptr = (const u8 *)(tcph + 1); u32 tstamp, tsecr; /* 3 reserved flags must be unset to avoid future breakage * ACK must be set * ECE/CWR are handled separately * All other flags URG/PSH/RST/SYN/FIN must be unset * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) * 0x00C00000 = CWR/ECE (handled separately) * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 */ if (((tcp_flag_word(tcph) & cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) return false; while (length > 0) { int opcode = *ptr++; int opsize; if (opcode == TCPOPT_EOL) break; if (opcode == TCPOPT_NOP) { length--; continue; } if (length < 2) break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; switch (opcode) { case TCPOPT_MD5SIG: /* doesn't influence state */ break; case TCPOPT_SACK: /* stricter checking performed later */ if (opsize % 8 != 2) return false; break; case TCPOPT_TIMESTAMP: /* only drop timestamps lower than new */ if (opsize != TCPOLEN_TIMESTAMP) return false; tstamp = get_unaligned_be32(ptr); tsecr = get_unaligned_be32(ptr + 4); if (after(tstamp, tstamp_new) || after(tsecr, tsecr_new)) return false; break; case TCPOPT_MSS: /* these should only be set on SYN */ case TCPOPT_WINDOW: case TCPOPT_SACK_PERM: case TCPOPT_FASTOPEN: case TCPOPT_EXP: default: /* don't drop if any unknown options are present */ return false; } ptr += opsize - 2; length -= opsize; } return true; } static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, struct cake_flow *flow) { bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; struct sk_buff *skb_check, *skb_prev = NULL; const struct ipv6hdr *ipv6h, *ipv6h_check; unsigned char _tcph[64], _tcph_check[64]; const struct tcphdr *tcph, *tcph_check; const struct iphdr *iph, *iph_check; struct ipv6hdr _iph, _iph_check; const struct sk_buff *skb; int seglen, num_found = 0; u32 tstamp = 0, tsecr = 0; __be32 elig_flags = 0; int sack_comp; /* no other possible ACKs to filter */ if (flow->head == flow->tail) return NULL; skb = flow->tail; tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); iph = cake_get_iphdr(skb, &_iph); if (!tcph) return NULL; cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); /* the 'triggering' packet need only have the ACK flag set. * also check that SYN is not set, as there won't be any previous ACKs. */ if ((tcp_flag_word(tcph) & (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) return NULL; /* the 'triggering' ACK is at the tail of the queue, we have already * returned if it is the only packet in the flow. loop through the rest * of the queue looking for pure ACKs with the same 5-tuple as the * triggering one. */ for (skb_check = flow->head; skb_check && skb_check != skb; skb_prev = skb_check, skb_check = skb_check->next) { iph_check = cake_get_iphdr(skb_check, &_iph_check); tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, sizeof(_tcph_check)); /* only TCP packets with matching 5-tuple are eligible, and only * drop safe headers */ if (!tcph_check || iph->version != iph_check->version || tcph_check->source != tcph->source || tcph_check->dest != tcph->dest) continue; if (iph_check->version == 4) { if (iph_check->saddr != iph->saddr || iph_check->daddr != iph->daddr) continue; seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; ipv6h_check = (struct ipv6hdr *)iph_check; if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) continue; seglen = ntohs(ipv6h_check->payload_len); } else { WARN_ON(1); /* shouldn't happen */ continue; } /* If the ECE/CWR flags changed from the previous eligible * packet in the same flow, we should no longer be dropping that * previous packet as this would lose information. */ if (elig_ack && (tcp_flag_word(tcph_check) & (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { elig_ack = NULL; elig_ack_prev = NULL; num_found--; } /* Check TCP options and flags, don't drop ACKs with segment * data, and don't drop ACKs with a higher cumulative ACK * counter than the triggering packet. Check ACK seqno here to * avoid parsing SACK options of packets we are going to exclude * anyway. */ if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || (seglen - __tcp_hdrlen(tcph_check)) != 0 || after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) continue; /* Check SACK options. The triggering packet must SACK more data * than the ACK under consideration, or SACK the same range but * have a larger cumulative ACK counter. The latter is a * pathological case, but is contained in the following check * anyway, just to be safe. */ sack_comp = cake_tcph_sack_compare(tcph_check, tcph); if (sack_comp < 0 || (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && sack_comp == 0)) continue; /* At this point we have found an eligible pure ACK to drop; if * we are in aggressive mode, we are done. Otherwise, keep * searching unless this is the second eligible ACK we * found. * * Since we want to drop ACK closest to the head of the queue, * save the first eligible ACK we find, even if we need to loop * again. */ if (!elig_ack) { elig_ack = skb_check; elig_ack_prev = skb_prev; elig_flags = (tcp_flag_word(tcph_check) & (TCP_FLAG_ECE | TCP_FLAG_CWR)); } if (num_found++ > 0) goto found; } /* We made it through the queue without finding two eligible ACKs . If * we found a single eligible ACK we can drop it in aggressive mode if * we can guarantee that this does not interfere with ECN flag * information. We ensure this by dropping it only if the enqueued * packet is consecutive with the eligible ACK, and their flags match. */ if (elig_ack && aggressive && elig_ack->next == skb && (elig_flags == (tcp_flag_word(tcph) & (TCP_FLAG_ECE | TCP_FLAG_CWR)))) goto found; return NULL; found: if (elig_ack_prev) elig_ack_prev->next = elig_ack->next; else flow->head = elig_ack->next; skb_mark_not_on_list(elig_ack); return elig_ack; } static u64 cake_ewma(u64 avg, u64 sample, u32 shift) { avg -= avg >> shift; avg += sample >> shift; return avg; } static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) { if (q->rate_flags & CAKE_FLAG_OVERHEAD) len -= off; if (q->max_netlen < len) q->max_netlen = len; if (q->min_netlen > len) q->min_netlen = len; len += q->rate_overhead; if (len < q->rate_mpu) len = q->rate_mpu; if (q->atm_mode == CAKE_ATM_ATM) { len += 47; len /= 48; len *= 53; } else if (q->atm_mode == CAKE_ATM_PTM) { /* Add one byte per 64 bytes or part thereof. * This is conservative and easier to calculate than the * precise value. */ len += (len + 63) / 64; } if (q->max_adjlen < len) q->max_adjlen = len; if (q->min_adjlen > len) q->min_adjlen = len; return len; } static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); u32 len = qdisc_pkt_len(skb); u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); if (!shinfo->gso_size) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) segs = DIV_ROUND_UP(skb->len - hdr_len, shinfo->gso_size); else segs = shinfo->gso_segs; len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); return (cake_calc_overhead(q, len, off) * (segs - 1) + cake_calc_overhead(q, last_len, off)); } static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) { struct cake_heap_entry ii = q->overflow_heap[i]; struct cake_heap_entry jj = q->overflow_heap[j]; q->overflow_heap[i] = jj; q->overflow_heap[j] = ii; q->tins[ii.t].overflow_idx[ii.b] = j; q->tins[jj.t].overflow_idx[jj.b] = i; } static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) { struct cake_heap_entry ii = q->overflow_heap[i]; return q->tins[ii.t].backlogs[ii.b]; } static void cake_heapify(struct cake_sched_data *q, u16 i) { static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; u32 mb = cake_heap_get_backlog(q, i); u32 m = i; while (m < a) { u32 l = m + m + 1; u32 r = l + 1; if (l < a) { u32 lb = cake_heap_get_backlog(q, l); if (lb > mb) { m = l; mb = lb; } } if (r < a) { u32 rb = cake_heap_get_backlog(q, r); if (rb > mb) { m = r; mb = rb; } } if (m != i) { cake_heap_swap(q, i, m); i = m; } else { break; } } } static void cake_heapify_up(struct cake_sched_data *q, u16 i) { while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { u16 p = (i - 1) >> 1; u32 ib = cake_heap_get_backlog(q, i); u32 pb = cake_heap_get_backlog(q, p); if (ib > pb) { cake_heap_swap(q, i, p); i = p; } else { break; } } } static int cake_advance_shaper(struct cake_sched_data *q, struct cake_tin_data *b, struct sk_buff *skb, ktime_t now, bool drop) { u32 len = get_cobalt_cb(skb)->adjusted_len; /* charge packet bandwidth to this tin * and to the global shaper. */ if (q->rate_ns) { u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; u64 global_dur = (len * q->rate_ns) >> q->rate_shft; u64 failsafe_dur = global_dur + (global_dur >> 1); if (ktime_before(b->time_next_packet, now)) b->time_next_packet = ktime_add_ns(b->time_next_packet, tin_dur); else if (ktime_before(b->time_next_packet, ktime_add_ns(now, tin_dur))) b->time_next_packet = ktime_add_ns(now, tin_dur); q->time_next_packet = ktime_add_ns(q->time_next_packet, global_dur); if (!drop) q->failsafe_next_packet = \ ktime_add_ns(q->failsafe_next_packet, failsafe_dur); } return len; } static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct cake_sched_data *q = qdisc_priv(sch); ktime_t now = ktime_get(); u32 idx = 0, tin = 0, len; struct cake_heap_entry qq; struct cake_tin_data *b; struct cake_flow *flow; struct sk_buff *skb; if (!q->overflow_timeout) { int i; /* Build fresh max-heap */ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) cake_heapify(q, i); } q->overflow_timeout = 65535; /* select longest queue for pruning */ qq = q->overflow_heap[0]; tin = qq.t; idx = qq.b; b = &q->tins[tin]; flow = &b->flows[idx]; skb = dequeue_head(flow); if (unlikely(!skb)) { /* heap has gone wrong, rebuild it next time */ q->overflow_timeout = 0; return idx + (tin << 16); } if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) b->unresponsive_flow_count++; len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; b->backlogs[idx] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; flow->dropped++; b->tin_dropped++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, len); cake_heapify(q, 0); return idx + (tin << 16); } static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) { const int offset = skb_network_offset(skb); u16 *buf, buf_; u8 dscp; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) return 0; /* ToS is in the second byte of iphdr */ dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; if (wash && dscp) { const int wlen = offset + sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen) || skb_try_make_writable(skb, wlen)) return 0; ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); } return dscp; case htons(ETH_P_IPV6): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) return 0; /* Traffic class is in the first and second bytes of ipv6hdr */ dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; if (wash && dscp) { const int wlen = offset + sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen) || skb_try_make_writable(skb, wlen)) return 0; ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); } return dscp; case htons(ETH_P_ARP): return 0x38; /* CS7 - Net Control */ default: /* If there is no Diffserv field, treat as best-effort */ return 0; } } static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); u32 tin, mark; bool wash; u8 dscp; /* Tin selection: Default to diffserv-based selection, allow overriding * using firewall marks or skb->priority. Call DSCP parsing early if * wash is enabled, otherwise defer to below to skip unneeded parsing. */ mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; wash = !!(q->rate_flags & CAKE_FLAG_WASH); if (wash) dscp = cake_handle_diffserv(skb, wash); if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) tin = 0; else if (mark && mark <= q->tin_cnt) tin = q->tin_order[mark - 1]; else if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->tin_cnt) tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; else { if (!wash) dscp = cake_handle_diffserv(skb, wash); tin = q->tin_index[dscp]; if (unlikely(tin >= q->tin_cnt)) tin = 0; } return &q->tins[tin]; } static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, struct sk_buff *skb, int flow_mode, int *qerr) { struct cake_sched_data *q = qdisc_priv(sch); struct tcf_proto *filter; struct tcf_result res; u16 flow = 0, host = 0; int result; filter = rcu_dereference_bh(q->filter_list); if (!filter) goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return 0; } #endif if (TC_H_MIN(res.classid) <= CAKE_QUEUES) flow = TC_H_MIN(res.classid); if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) host = TC_H_MAJ(res.classid) >> 16; } hash: *t = cake_select_tin(sch, skb); return cake_hash(*t, skb, flow_mode, flow, host) + 1; } static void cake_reconfigure(struct Qdisc *sch); static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct cake_sched_data *q = qdisc_priv(sch); int len = qdisc_pkt_len(skb); int ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; u32 idx, tin; /* choose flow to insert into */ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; /* ensure shaper state isn't stale */ if (!b->tin_backlog) { if (ktime_before(b->time_next_packet, now)) b->time_next_packet = now; if (!sch->q.qlen) { if (ktime_before(q->time_next_packet, now)) { q->failsafe_next_packet = now; q->time_next_packet = now; } else if (ktime_after(q->time_next_packet, now) && ktime_after(q->failsafe_next_packet, now)) { u64 next = \ min(ktime_to_ns(q->time_next_packet), ktime_to_ns( q->failsafe_next_packet)); sch->qstats.overlimits++; qdisc_watchdog_schedule_ns(&q->watchdog, next); } } } if (unlikely(len > b->max_skblen)) b->max_skblen = len; if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); flow_queue_add(flow, segs); sch->q.qlen++; numsegs++; slen += segs->len; q->buffer_used += segs->truesize; b->packets++; } /* stats */ b->bytes += slen; b->backlogs[idx] += slen; b->tin_backlog += slen; sch->qstats.backlog += slen; q->avg_window_bytes += slen; qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); } else { /* not splitting */ cobalt_set_enqueue_time(skb, now); get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); if (q->ack_filter) ack = cake_ack_filter(q, flow); if (ack) { b->ack_drops++; sch->qstats.drops++; b->bytes += qdisc_pkt_len(ack); len -= qdisc_pkt_len(ack); q->buffer_used += skb->truesize - ack->truesize; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); consume_skb(ack); } else { sch->q.qlen++; q->buffer_used += skb->truesize; } /* stats */ b->packets++; b->bytes += len; b->backlogs[idx] += len; b->tin_backlog += len; sch->qstats.backlog += len; q->avg_window_bytes += len; } if (q->overflow_timeout) cake_heapify_up(q, b->overflow_idx[idx]); /* incoming bandwidth capacity estimate */ if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { u64 packet_interval = \ ktime_to_ns(ktime_sub(now, q->last_packet_time)); if (packet_interval > NSEC_PER_SEC) packet_interval = NSEC_PER_SEC; /* filter out short-term bursts, eg. wifi aggregation */ q->avg_packet_interval = \ cake_ewma(q->avg_packet_interval, packet_interval, (packet_interval > q->avg_packet_interval ? 2 : 8)); q->last_packet_time = now; if (packet_interval > q->avg_packet_interval) { u64 window_interval = \ ktime_to_ns(ktime_sub(now, q->avg_window_begin)); u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; b = div64_u64(b, window_interval); q->avg_peak_bandwidth = cake_ewma(q->avg_peak_bandwidth, b, b > q->avg_peak_bandwidth ? 2 : 8); q->avg_window_bytes = 0; q->avg_window_begin = now; if (ktime_after(now, ktime_add_ms(q->last_reconfig_time, 250))) { q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; cake_reconfigure(sch); } } } else { q->avg_window_bytes = 0; q->last_packet_time = now; } /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { b->decaying_flow_count--; list_move_tail(&flow->flowchain, &b->new_flows); } flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. */ flow->set = CAKE_SET_BULK; b->sparse_flow_count--; b->bulk_flow_count++; cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) q->buffer_max_used = q->buffer_used; if (q->buffer_used > q->buffer_limit) { bool same_flow = false; u32 dropped = 0; u32 drop_id; while (q->buffer_used > q->buffer_limit) { dropped++; drop_id = cake_drop(sch, to_free); if ((drop_id >> 16) == tin && (drop_id & 0xFFFF) == idx) same_flow = true; } b->drop_overlimit += dropped; if (same_flow) return NET_XMIT_CN; } return NET_XMIT_SUCCESS; } static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; struct cake_flow *flow = &b->flows[q->cur_flow]; struct sk_buff *skb = NULL; u32 len; if (flow->head) { skb = dequeue_head(flow); len = qdisc_pkt_len(skb); b->backlogs[q->cur_flow] -= len; b->tin_backlog -= len; sch->qstats.backlog -= len; q->buffer_used -= skb->truesize; sch->q.qlen--; if (q->overflow_timeout) cake_heapify(q, b->overflow_idx[q->cur_flow]); } return skb; } /* Discard leftover packets from a tin no longer in use. */ static void cake_clear_tin(struct Qdisc *sch, u16 tin) { struct cake_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; u64 delay; u32 len; begin: if (!sch->q.qlen) return NULL; /* global hard shaper */ if (ktime_after(q->time_next_packet, now) && ktime_after(q->failsafe_next_packet, now)) { u64 next = min(ktime_to_ns(q->time_next_packet), ktime_to_ns(q->failsafe_next_packet)); sch->qstats.overlimits++; qdisc_watchdog_schedule_ns(&q->watchdog, next); return NULL; } /* Choose a class to work on. */ if (!q->rate_ns) { /* In unlimited mode, can't rely on shaper timings, just balance * with DRR */ bool wrapped = false, empty = true; while (b->tin_deficit < 0 || !(b->sparse_flow_count + b->bulk_flow_count)) { if (b->tin_deficit <= 0) b->tin_deficit += b->tin_quantum; if (b->sparse_flow_count + b->bulk_flow_count) empty = false; q->cur_tin++; b++; if (q->cur_tin >= q->tin_cnt) { q->cur_tin = 0; b = q->tins; if (wrapped) { /* It's possible for q->qlen to be * nonzero when we actually have no * packets anywhere. */ if (empty) return NULL; } else { wrapped = true; } } } } else { /* In shaped mode, choose: * - Highest-priority tin with queue and meeting schedule, or * - The earliest-scheduled tin with queue. */ ktime_t best_time = KTIME_MAX; int tin, best_tin = 0; for (tin = 0; tin < q->tin_cnt; tin++) { b = q->tins + tin; if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { ktime_t time_to_pkt = \ ktime_sub(b->time_next_packet, now); if (ktime_to_ns(time_to_pkt) <= 0 || ktime_compare(time_to_pkt, best_time) <= 0) { best_time = time_to_pkt; best_tin = tin; } } } q->cur_tin = best_tin; b = q->tins + best_tin; /* No point in going further if no packets to deliver. */ if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) return NULL; } retry: /* service this class */ head = &b->decaying_flows; if (!first_flow || list_empty(head)) { head = &b->new_flows; if (list_empty(head)) { head = &b->old_flows; if (unlikely(list_empty(head))) { head = &b->decaying_flows; if (unlikely(list_empty(head))) goto begin; } } } flow = list_first_entry(head, struct cake_flow, flowchain); q->cur_flow = flow - b->flows; first_flow = false; /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying * rotations. No non-empty flow can go into the decaying * rotation, so they can't get deficits */ if (flow->set == CAKE_SET_SPARSE) { if (flow->head) { b->sparse_flow_count--; b->bulk_flow_count++; cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { /* we've moved it to the bulk rotation for * correct deficit accounting but we still want * to count it as a sparse flow, not a bulk one. */ flow->set = CAKE_SET_SPARSE_WAIT; } } flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; } /* Retrieve a packet via the AQM */ while (1) { skb = cake_dequeue_one(sch); if (!skb) { /* this queue was actually empty */ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) b->unresponsive_flow_count--; if (flow->cvars.p_drop || flow->cvars.count || ktime_before(now, flow->cvars.drop_next)) { /* keep in the flowchain until the state has * decayed to rest */ list_move_tail(&flow->flowchain, &b->decaying_flows); if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) { b->sparse_flow_count--; b->decaying_flow_count++; } flow->set = CAKE_SET_DECAYING; } else { /* remove empty queue from the flowchain */ list_del_init(&flow->flowchain); if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) b->sparse_flow_count--; else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; flow->set = CAKE_SET_NONE; } goto begin; } reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, (b->bulk_flow_count * !!(q->rate_flags & CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ if (q->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); flow->deficit -= len; b->tin_deficit -= len; } flow->dropped++; b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); kfree_skb_reason(skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } b->tin_ecn_mark += !!flow->cvars.ecn_marked; qdisc_bstats_update(sch, skb); /* collect delay stats */ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); b->avge_delay = cake_ewma(b->avge_delay, delay, 8); b->peak_delay = cake_ewma(b->peak_delay, delay, delay > b->peak_delay ? 2 : 8); b->base_delay = cake_ewma(b->base_delay, delay, delay < b->base_delay ? 2 : 8); len = cake_advance_shaper(q, b, skb, now, false); flow->deficit -= len; b->tin_deficit -= len; if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { u64 next = min(ktime_to_ns(q->time_next_packet), ktime_to_ns(q->failsafe_next_packet)); qdisc_watchdog_schedule_ns(&q->watchdog, next); } else if (!sch->q.qlen) { int i; for (i = 0; i < q->tin_cnt; i++) { if (q->tins[i].decaying_flow_count) { ktime_t next = \ ktime_add_ns(now, q->tins[i].cparams.target); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); break; } } } if (q->overflow_timeout) q->overflow_timeout--; return skb; } static void cake_reset(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); u32 c; if (!q->tins) return; for (c = 0; c < CAKE_MAX_TINS; c++) cake_clear_tin(sch, c); } static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, [TCA_CAKE_ATM] = { .type = NLA_U32 }, [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, [TCA_CAKE_RTT] = { .type = NLA_U32 }, [TCA_CAKE_TARGET] = { .type = NLA_U32 }, [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, [TCA_CAKE_NAT] = { .type = NLA_U32 }, [TCA_CAKE_RAW] = { .type = NLA_U32 }, [TCA_CAKE_WASH] = { .type = NLA_U32 }, [TCA_CAKE_MPU] = { .type = NLA_U32 }, [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, }; static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, u64 target_ns, u64 rtt_est_ns) { /* convert byte-rate into time-per-byte * so it will always unwedge in reasonable time. */ static const u64 MIN_RATE = 64; u32 byte_target = mtu; u64 byte_target_ns; u8 rate_shft = 0; u64 rate_ns = 0; b->flow_quantum = 1514; if (rate) { b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); rate_shft = 34; rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); while (!!(rate_ns >> 34)) { rate_ns >>= 1; rate_shft--; } } /* else unlimited, ie. zero delay */ b->tin_rate_bps = rate; b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; byte_target_ns = (byte_target * rate_ns) >> rate_shft; b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); b->cparams.interval = max(rtt_est_ns + b->cparams.target - target_ns, b->cparams.target * 2); b->cparams.mtu_time = byte_target_ns; b->cparams.p_inc = 1 << 24; /* 1/256 */ b->cparams.p_dec = 1 << 20; /* 1/4096 */ } static int cake_config_besteffort(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; q->tin_cnt = 1; q->tin_index = besteffort; q->tin_order = normal_order; cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum = 65535; return 0; } static int cake_config_precedence(struct Qdisc *sch) { /* convert high-level (user visible) parameters into internal format */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 256; u32 i; q->tin_cnt = 8; q->tin_index = precedence; q->tin_order = normal_order; for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; quantum *= 7; quantum >>= 3; } return 0; } /* List of known Diffserv codepoints: * * Default Forwarding (DF/CS0) - Best Effort * Max Throughput (TOS2) * Min Delay (TOS4) * LLT "La" (TOS5) * Assured Forwarding 1 (AF1x) - x3 * Assured Forwarding 2 (AF2x) - x3 * Assured Forwarding 3 (AF3x) - x3 * Assured Forwarding 4 (AF4x) - x3 * Precedence Class 1 (CS1) * Precedence Class 2 (CS2) * Precedence Class 3 (CS3) * Precedence Class 4 (CS4) * Precedence Class 5 (CS5) * Precedence Class 6 (CS6) * Precedence Class 7 (CS7) * Voice Admit (VA) * Expedited Forwarding (EF) * Lower Effort (LE) * * Total 26 codepoints. */ /* List of traffic classes in RFC 4594, updated by RFC 8622: * (roughly descending order of contended priority) * (roughly ascending order of uncontended throughput) * * Network Control (CS6,CS7) - routing traffic * Telephony (EF,VA) - aka. VoIP streams * Signalling (CS5) - VoIP setup * Multimedia Conferencing (AF4x) - aka. video calls * Realtime Interactive (CS4) - eg. games * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch * Broadcast Video (CS3) * Low-Latency Data (AF2x,TOS4) - eg. database * Ops, Admin, Management (CS2) - eg. ssh * Standard Service (DF & unrecognised codepoints) * High-Throughput Data (AF1x,TOS2) - eg. web traffic * Low-Priority Data (LE,CS1) - eg. BitTorrent * * Total 12 traffic classes. */ static int cake_config_diffserv8(struct Qdisc *sch) { /* Pruned list of traffic classes for typical applications: * * Network Control (CS6, CS7) * Minimum Latency (EF, VA, CS5, CS4) * Interactive Shell (CS2) * Low Latency Transactions (AF2x, TOS4) * Video Streaming (AF4x, AF3x, CS3) * Bog Standard (DF etc.) * High Throughput (AF1x, TOS2, CS1) * Background Traffic (LE) * * Total 8 traffic classes. */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 256; u32 i; q->tin_cnt = 8; /* codepoint to class mapping */ q->tin_index = diffserv8; q->tin_order = normal_order; /* class characteristics */ for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum = max_t(u16, 1U, quantum); /* calculate next class's parameters */ rate *= 7; rate >>= 3; quantum *= 7; quantum >>= 3; } return 0; } static int cake_config_diffserv4(struct Qdisc *sch) { /* Further pruned list of traffic classes for four-class system: * * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) * Best Effort (DF, AF1x, TOS2, and those not specified) * Background Traffic (LE, CS1) * * Total 4 traffic classes. */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 1024; q->tin_cnt = 4; /* codepoint to class mapping */ q->tin_index = diffserv4; q->tin_order = bulk_order; /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[2], rate >> 1, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[3], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; q->tins[1].tin_quantum = quantum >> 4; q->tins[2].tin_quantum = quantum >> 1; q->tins[3].tin_quantum = quantum >> 2; return 0; } static int cake_config_diffserv3(struct Qdisc *sch) { /* Simplified Diffserv structure with 3 tins. * Latency Sensitive (CS7, CS6, EF, VA, TOS4) * Best Effort * Low Priority (LE, CS1) */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); u64 rate = q->rate_bps; u32 quantum = 1024; q->tin_cnt = 3; /* codepoint to class mapping */ q->tin_index = diffserv3; q->tin_order = bulk_order; /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, us_to_ns(q->target), us_to_ns(q->interval)); cake_set_rate(&q->tins[2], rate >> 2, mtu, us_to_ns(q->target), us_to_ns(q->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; q->tins[1].tin_quantum = quantum >> 4; q->tins[2].tin_quantum = quantum >> 2; return 0; } static void cake_reconfigure(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); int c, ft; switch (q->tin_mode) { case CAKE_DIFFSERV_BESTEFFORT: ft = cake_config_besteffort(sch); break; case CAKE_DIFFSERV_PRECEDENCE: ft = cake_config_precedence(sch); break; case CAKE_DIFFSERV_DIFFSERV8: ft = cake_config_diffserv8(sch); break; case CAKE_DIFFSERV_DIFFSERV4: ft = cake_config_diffserv4(sch); break; case CAKE_DIFFSERV_DIFFSERV3: default: ft = cake_config_diffserv3(sch); break; } for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { cake_clear_tin(sch, c); q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; } q->rate_ns = q->tins[ft].tin_rate_ns; q->rate_shft = q->tins[ft].tin_rate_shft; if (q->buffer_config_limit) { q->buffer_limit = q->buffer_config_limit; } else if (q->rate_bps) { u64 t = q->rate_bps * q->interval; do_div(t, USEC_PER_SEC / 4); q->buffer_limit = max_t(u32, t, 4U << 20); } else { q->buffer_limit = ~0; } sch->flags &= ~TCQ_F_CAN_BYPASS; q->buffer_limit = min(q->buffer_limit, max(sch->limit * psched_mtu(qdisc_dev(sch)), q->buffer_config_limit)); } static int cake_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; u16 rate_flags; u8 flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, extack); if (err < 0) return err; flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) flow_mode &= ~CAKE_FLOW_NAT_FLAG; flow_mode |= CAKE_FLOW_NAT_FLAG * !!nla_get_u32(tb[TCA_CAKE_NAT]); #else NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], "No conntrack support in kernel"); return -EOPNOTSUPP; #endif } if (tb[TCA_CAKE_BASE_RATE64]) WRITE_ONCE(q->rate_bps, nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); if (tb[TCA_CAKE_DIFFSERV_MODE]) WRITE_ONCE(q->tin_mode, nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) rate_flags |= CAKE_FLAG_WASH; else rate_flags &= ~CAKE_FLAG_WASH; } if (tb[TCA_CAKE_FLOW_MODE]) flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); if (tb[TCA_CAKE_ATM]) WRITE_ONCE(q->atm_mode, nla_get_u32(tb[TCA_CAKE_ATM])); if (tb[TCA_CAKE_OVERHEAD]) { WRITE_ONCE(q->rate_overhead, nla_get_s32(tb[TCA_CAKE_OVERHEAD])); rate_flags |= CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; q->min_netlen = ~0; q->min_adjlen = ~0; } if (tb[TCA_CAKE_RAW]) { rate_flags &= ~CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; q->min_netlen = ~0; q->min_adjlen = ~0; } if (tb[TCA_CAKE_MPU]) WRITE_ONCE(q->rate_mpu, nla_get_u32(tb[TCA_CAKE_MPU])); if (tb[TCA_CAKE_RTT]) { u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); WRITE_ONCE(q->interval, max(interval, 1U)); } if (tb[TCA_CAKE_TARGET]) { u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); WRITE_ONCE(q->target, max(target, 1U)); } if (tb[TCA_CAKE_AUTORATE]) { if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; else rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; } if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) rate_flags |= CAKE_FLAG_INGRESS; else rate_flags &= ~CAKE_FLAG_INGRESS; } if (tb[TCA_CAKE_ACK_FILTER]) WRITE_ONCE(q->ack_filter, nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); if (tb[TCA_CAKE_MEMORY]) WRITE_ONCE(q->buffer_config_limit, nla_get_u32(tb[TCA_CAKE_MEMORY])); if (tb[TCA_CAKE_SPLIT_GSO]) { if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) rate_flags |= CAKE_FLAG_SPLIT_GSO; else rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } if (tb[TCA_CAKE_FWMARK]) { WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); WRITE_ONCE(q->fwmark_shft, q->fwmark_mask ? __ffs(q->fwmark_mask) : 0); } WRITE_ONCE(q->rate_flags, rate_flags); WRITE_ONCE(q->flow_mode, flow_mode); if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); sch_tree_unlock(sch); } return 0; } static void cake_destroy(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); tcf_block_put(q->block); kvfree(q->tins); } static int cake_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct cake_sched_data *q = qdisc_priv(sch); int i, j, err; sch->limit = 10240; q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; q->rate_bps = 0; /* unlimited by default */ q->interval = 100000; /* 100ms default */ q->target = 5000; /* 5ms: codel RFC argues * for 5 to 10% of interval */ q->rate_flags |= CAKE_FLAG_SPLIT_GSO; q->cur_tin = 0; q->cur_flow = 0; qdisc_watchdog_init(&q->watchdog, sch); if (opt) { err = cake_change(sch, opt, extack); if (err) return err; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; quantum_div[0] = ~0; for (i = 1; i <= CAKE_QUEUES; i++) quantum_div[i] = 65535 / i; q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), GFP_KERNEL); if (!q->tins) return -ENOMEM; for (i = 0; i < CAKE_MAX_TINS; i++) { struct cake_tin_data *b = q->tins + i; INIT_LIST_HEAD(&b->new_flows); INIT_LIST_HEAD(&b->old_flows); INIT_LIST_HEAD(&b->decaying_flows); b->sparse_flow_count = 0; b->bulk_flow_count = 0; b->decaying_flow_count = 0; for (j = 0; j < CAKE_QUEUES; j++) { struct cake_flow *flow = b->flows + j; u32 k = j * CAKE_MAX_TINS + i; INIT_LIST_HEAD(&flow->flowchain); cobalt_vars_init(&flow->cvars); q->overflow_heap[k].t = i; q->overflow_heap[k].b = j; b->overflow_idx[j] = k; } } cake_reconfigure(sch); q->avg_peak_bandwidth = q->rate_bps; q->min_netlen = ~0; q->min_adjlen = ~0; return 0; } static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; u16 rate_flags; u8 flow_mode; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) goto nla_put_failure; flow_mode = READ_ONCE(q->flow_mode); if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_MEMORY, READ_ONCE(q->buffer_config_limit))) goto nla_put_failure; rate_flags = READ_ONCE(q->rate_flags); if (nla_put_u32(skb, TCA_CAKE_AUTORATE, !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_INGRESS, !!(rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_NAT, !!(flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_WASH, !!(rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) goto nla_put_failure; if (!(rate_flags & CAKE_FLAG_OVERHEAD)) if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tstats, *ts; int i; if (!stats) return -1; #define PUT_STAT_U32(attr, data) do { \ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) #define PUT_STAT_U64(attr, data) do { \ if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ data, TCA_CAKE_STATS_PAD)) \ goto nla_put_failure; \ } while (0) PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); PUT_STAT_U32(MAX_NETLEN, q->max_netlen); PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); PUT_STAT_U32(MIN_NETLEN, q->min_netlen); PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); #undef PUT_STAT_U32 #undef PUT_STAT_U64 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); if (!tstats) goto nla_put_failure; #define PUT_TSTAT_U32(attr, data) do { \ if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) #define PUT_TSTAT_U64(attr, data) do { \ if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ data, TCA_CAKE_TIN_STATS_PAD)) \ goto nla_put_failure; \ } while (0) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[q->tin_order[i]]; ts = nla_nest_start_noflag(d->skb, i + 1); if (!ts) goto nla_put_failure; PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); PUT_TSTAT_U64(SENT_BYTES64, b->bytes); PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); PUT_TSTAT_U32(TARGET_US, ktime_to_us(ns_to_ktime(b->cparams.target))); PUT_TSTAT_U32(INTERVAL_US, ktime_to_us(ns_to_ktime(b->cparams.interval))); PUT_TSTAT_U32(SENT_PACKETS, b->packets); PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); PUT_TSTAT_U32(PEAK_DELAY_US, ktime_to_us(ns_to_ktime(b->peak_delay))); PUT_TSTAT_U32(AVG_DELAY_US, ktime_to_us(ns_to_ktime(b->avge_delay))); PUT_TSTAT_U32(BASE_DELAY_US, ktime_to_us(ns_to_ktime(b->base_delay))); PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); PUT_TSTAT_U32(WAY_MISSES, b->way_misses); PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + b->decaying_flow_count); PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); nla_nest_end(d->skb, ts); } #undef PUT_TSTAT_U32 #undef PUT_TSTAT_U64 nla_nest_end(d->skb, tstats); return nla_nest_end(d->skb, stats); nla_put_failure: nla_nest_cancel(d->skb, stats); return -1; } static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long cake_find(struct Qdisc *sch, u32 classid) { return 0; } static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static void cake_unbind(struct Qdisc *q, unsigned long cl) { } static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct cake_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static int cake_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct cake_sched_data *q = qdisc_priv(sch); const struct cake_flow *flow = NULL; struct gnet_stats_queue qs = { 0 }; struct nlattr *stats; u32 idx = cl - 1; if (idx < CAKE_QUEUES * q->tin_cnt) { const struct cake_tin_data *b = \ &q->tins[q->tin_order[idx / CAKE_QUEUES]]; const struct sk_buff *skb; flow = &b->flows[idx % CAKE_QUEUES]; if (flow->head) { sch_tree_lock(sch); skb = flow->head; while (skb) { qs.qlen++; skb = skb->next; } sch_tree_unlock(sch); } qs.backlog = b->backlogs[idx % CAKE_QUEUES]; qs.drops = flow->dropped; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; if (flow) { ktime_t now = ktime_get(); stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) return -1; #define PUT_STAT_U32(attr, data) do { \ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) #define PUT_STAT_S32(attr, data) do { \ if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ goto nla_put_failure; \ } while (0) PUT_STAT_S32(DEFICIT, flow->deficit); PUT_STAT_U32(DROPPING, flow->cvars.dropping); PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); PUT_STAT_U32(P_DROP, flow->cvars.p_drop); if (flow->cvars.p_drop) { PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, flow->cvars.blue_timer))); } if (flow->cvars.dropping) { PUT_STAT_S32(DROP_NEXT_US, ktime_to_us( ktime_sub(now, flow->cvars.drop_next))); } if (nla_nest_end(d->skb, stats) < 0) return -1; } return 0; nla_put_failure: nla_nest_cancel(d->skb, stats); return -1; } static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cake_sched_data *q = qdisc_priv(sch); unsigned int i, j; if (arg->stop) return; for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[q->tin_order[i]]; for (j = 0; j < CAKE_QUEUES; j++) { if (list_empty(&b->flows[j].flowchain)) { arg->count++; continue; } if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, arg)) break; } } } static const struct Qdisc_class_ops cake_class_ops = { .leaf = cake_leaf, .find = cake_find, .tcf_block = cake_tcf_block, .bind_tcf = cake_bind, .unbind_tcf = cake_unbind, .dump = cake_dump_class, .dump_stats = cake_dump_class_stats, .walk = cake_walk, }; static struct Qdisc_ops cake_qdisc_ops __read_mostly = { .cl_ops = &cake_class_ops, .id = "cake", .priv_size = sizeof(struct cake_sched_data), .enqueue = cake_enqueue, .dequeue = cake_dequeue, .peek = qdisc_peek_dequeued, .init = cake_init, .reset = cake_reset, .destroy = cake_destroy, .change = cake_change, .dump = cake_dump, .dump_stats = cake_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("cake"); static int __init cake_module_init(void) { return register_qdisc(&cake_qdisc_ops); } static void __exit cake_module_exit(void) { unregister_qdisc(&cake_qdisc_ops); } module_init(cake_module_init) module_exit(cake_module_exit) MODULE_AUTHOR("Jonathan Morton"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("The CAKE shaper.");
3 19 23 4 19 5 2 3 148 148 148 147 148 52 148 53 94 95 52 53 52 148 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 // SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/perf_event.h> #include <linux/slab.h> #include <linux/sched/task_stack.h> #include <linux/uprobes.h> #include "internal.h" struct callchain_cpus_entries { struct rcu_head rcu_head; struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + sizeof(__u64) * (sysctl_perf_event_max_stack + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; __weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } __weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } static void release_callchain_buffers_rcu(struct rcu_head *head) { struct callchain_cpus_entries *entries; int cpu; entries = container_of(head, struct callchain_cpus_entries, rcu_head); for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); } static void release_callchain_buffers(void) { struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } static int alloc_callchain_buffers(void) { int cpu; int size; struct callchain_cpus_entries *entries; /* * We can't use the percpu allocation API for data that can be * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. */ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) return -ENOMEM; size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!entries->cpu_entries[cpu]) goto fail; } rcu_assign_pointer(callchain_cpus_entries, entries); return 0; fail: for_each_possible_cpu(cpu) kfree(entries->cpu_entries[cpu]); kfree(entries); return -ENOMEM; } int get_callchain_buffers(int event_max_stack) { int err = 0; int count; mutex_lock(&callchain_mutex); count = atomic_inc_return(&nr_callchain_events); if (WARN_ON_ONCE(count < 1)) { err = -EINVAL; goto exit; } /* * If requesting per event more than the global cap, * return a different error to help userspace figure * this out. * * And also do it here so that we have &callchain_mutex held. */ if (event_max_stack > sysctl_perf_event_max_stack) { err = -EOVERFLOW; goto exit; } if (count == 1) err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); mutex_unlock(&callchain_mutex); return err; } void put_callchain_buffers(void) { if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { release_callchain_buffers(); mutex_unlock(&callchain_mutex); } } struct perf_callchain_entry *get_callchain_entry(int *rctx) { int cpu; struct callchain_cpus_entries *entries; *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; entries = rcu_dereference(callchain_cpus_entries); if (!entries) { put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx); return NULL; } cpu = smp_processor_id(); return (((void *)entries->cpu_entries[cpu]) + (*rctx * perf_callchain_entry__sizeof())); } void put_callchain_entry(int rctx) { put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry, int start_entry_idx) { #ifdef CONFIG_UPROBES struct uprobe_task *utask = current->utask; struct return_instance *ri; __u64 *cur_ip, *last_ip, tramp_addr; if (likely(!utask || !utask->return_instances)) return; cur_ip = &entry->ip[start_entry_idx]; last_ip = &entry->ip[entry->nr - 1]; ri = utask->return_instances; tramp_addr = uprobe_get_trampoline_vaddr(); /* * If there are pending uretprobes for the current thread, they are * recorded in a list inside utask->return_instances; each such * pending uretprobe replaces traced user function's return address on * the stack, so when stack trace is captured, instead of seeing * actual function's return address, we'll have one or many uretprobe * trampoline addresses in the stack trace, which are not helpful and * misleading to users. * So here we go over the pending list of uretprobes, and each * encountered trampoline address is replaced with actual return * address. */ while (ri && cur_ip <= last_ip) { if (*cur_ip == tramp_addr) { *cur_ip = ri->orig_ret_vaddr; ri = ri->next; } cur_ip++; } #endif } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; int rctx, start_entry_idx; /* crosstask is not supported for user stacks */ if (crosstask && user && !kernel) return NULL; entry = get_callchain_entry(&rctx); if (!entry) return NULL; ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = 0; ctx.contexts = 0; ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } if (user && !crosstask) { if (!user_mode(regs)) { if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) goto exit_put; regs = task_pt_regs(current); } if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); start_entry_idx = entry->nr; perf_callchain_user(&ctx, regs); fixup_uretprobe_trampoline_entries(entry, start_entry_idx); } exit_put: put_callchain_entry(rctx); return entry; } static int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); if (ret || !write) return ret; mutex_lock(&callchain_mutex); if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else *value = new_value; mutex_unlock(&callchain_mutex); return ret; } static const struct ctl_table callchain_sysctl_table[] = { { .procname = "perf_event_max_stack", .data = &sysctl_perf_event_max_stack, .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", .data = &sysctl_perf_event_max_contexts_per_stack, .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_THOUSAND, }, }; static int __init init_callchain_sysctls(void) { register_sysctl_init("kernel", callchain_sysctl_table); return 0; } core_initcall(init_callchain_sysctls);
3 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 // SPDX-License-Identifier: GPL-2.0-only #include <linux/phy.h> #include <linux/ethtool_netlink.h> #include "netlink.h" #include "common.h" struct plca_req_info { struct ethnl_req_info base; }; struct plca_reply_data { struct ethnl_reply_data base; struct phy_plca_cfg plca_cfg; struct phy_plca_status plca_st; }; // Helpers ------------------------------------------------------------------ // #define PLCA_REPDATA(__reply_base) \ container_of(__reply_base, struct plca_reply_data, base) // PLCA get configuration message ------------------------------------------- // const struct nla_policy ethnl_plca_get_cfg_policy[] = { [ETHTOOL_A_PLCA_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid, bool *mod) { const struct nlattr *attr = tb[attrid]; if (!attr || WARN_ON_ONCE(attrid >= ARRAY_SIZE(ethnl_plca_set_cfg_policy))) return; switch (ethnl_plca_set_cfg_policy[attrid].type) { case NLA_U8: *dst = nla_get_u8(attr); break; case NLA_U32: *dst = nla_get_u32(attr); break; default: WARN_ON_ONCE(1); } *mod = true; } static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out; } // note: rtnl_lock is held already by ethnl_default_doit ops = ethtool_phy_ops; if (!ops || !ops->get_plca_cfg) { ret = -EOPNOTSUPP; goto out; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out; memset(&data->plca_cfg, 0xff, sizeof_field(struct plca_reply_data, plca_cfg)); ret = ops->get_plca_cfg(phydev, &data->plca_cfg); ethnl_ops_complete(dev); out: return ret; } static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u16)) + /* _VERSION */ nla_total_size(sizeof(u8)) + /* _ENABLED */ nla_total_size(sizeof(u32)) + /* _NODE_CNT */ nla_total_size(sizeof(u32)) + /* _NODE_ID */ nla_total_size(sizeof(u32)) + /* _TO_TIMER */ nla_total_size(sizeof(u32)) + /* _BURST_COUNT */ nla_total_size(sizeof(u32)); /* _BURST_TIMER */ } static int plca_get_cfg_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct plca_reply_data *data = PLCA_REPDATA(reply_base); const struct phy_plca_cfg *plca = &data->plca_cfg; if ((plca->version >= 0 && nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) || (plca->enabled >= 0 && nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) || (plca->node_id >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) || (plca->node_cnt >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) || (plca->to_tmr >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) || (plca->burst_cnt >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) || (plca->burst_tmr >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr))) return -EMSGSIZE; return 0; }; // PLCA set configuration message ------------------------------------------- // const struct nla_policy ethnl_plca_set_cfg_policy[] = { [ETHTOOL_A_PLCA_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255), [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255), }; static int ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_plca_cfg plca_cfg; struct phy_device *phydev; bool mod = false; int ret; phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) return -EOPNOTSUPP; ops = ethtool_phy_ops; if (!ops || !ops->set_plca_cfg) return -EOPNOTSUPP; memset(&plca_cfg, 0xff, sizeof(plca_cfg)); plca_update_sint(&plca_cfg.enabled, tb, ETHTOOL_A_PLCA_ENABLED, &mod); plca_update_sint(&plca_cfg.node_id, tb, ETHTOOL_A_PLCA_NODE_ID, &mod); plca_update_sint(&plca_cfg.node_cnt, tb, ETHTOOL_A_PLCA_NODE_CNT, &mod); plca_update_sint(&plca_cfg.to_tmr, tb, ETHTOOL_A_PLCA_TO_TMR, &mod); plca_update_sint(&plca_cfg.burst_cnt, tb, ETHTOOL_A_PLCA_BURST_CNT, &mod); plca_update_sint(&plca_cfg.burst_tmr, tb, ETHTOOL_A_PLCA_BURST_TMR, &mod); if (!mod) return 0; ret = ops->set_plca_cfg(phydev, &plca_cfg, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_plca_cfg_request_ops = { .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG, .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY, .hdr_attr = ETHTOOL_A_PLCA_HEADER, .req_info_size = sizeof(struct plca_req_info), .reply_data_size = sizeof(struct plca_reply_data), .prepare_data = plca_get_cfg_prepare_data, .reply_size = plca_get_cfg_reply_size, .fill_reply = plca_get_cfg_fill_reply, .set = ethnl_set_plca, .set_ntf_cmd = ETHTOOL_MSG_PLCA_NTF, }; // PLCA get status message -------------------------------------------------- // const struct nla_policy ethnl_plca_get_status_policy[] = { [ETHTOOL_A_PLCA_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out; } // note: rtnl_lock is held already by ethnl_default_doit ops = ethtool_phy_ops; if (!ops || !ops->get_plca_status) { ret = -EOPNOTSUPP; goto out; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out; memset(&data->plca_st, 0xff, sizeof_field(struct plca_reply_data, plca_st)); ret = ops->get_plca_status(phydev, &data->plca_st); ethnl_ops_complete(dev); out: return ret; } static int plca_get_status_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u8)); /* _STATUS */ } static int plca_get_status_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct plca_reply_data *data = PLCA_REPDATA(reply_base); const u8 status = data->plca_st.pst; if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status)) return -EMSGSIZE; return 0; }; const struct ethnl_request_ops ethnl_plca_status_request_ops = { .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, .hdr_attr = ETHTOOL_A_PLCA_HEADER, .req_info_size = sizeof(struct plca_req_info), .reply_data_size = sizeof(struct plca_reply_data), .prepare_data = plca_get_status_prepare_data, .reply_size = plca_get_status_reply_size, .fill_reply = plca_get_status_fill_reply, };
10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007, 2008, 2009 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> * * Automatically generate and verify integrity data on PI capable devices if the * bio submitter didn't provide PI itself. This ensures that kernel verifies * data integrity even if the file system (or other user of the block device) is * not aware of PI. */ #include <linux/blk-integrity.h> #include <linux/t10-pi.h> #include <linux/workqueue.h> #include "blk.h" struct bio_integrity_data { struct bio *bio; struct bvec_iter saved_bio_iter; struct work_struct work; struct bio_integrity_payload bip; struct bio_vec bvec; }; static struct kmem_cache *bid_slab; static mempool_t bid_pool; static struct workqueue_struct *kintegrityd_wq; static void bio_integrity_finish(struct bio_integrity_data *bid) { bid->bio->bi_integrity = NULL; bid->bio->bi_opf &= ~REQ_INTEGRITY; kfree(bvec_virt(bid->bip.bip_vec)); mempool_free(bid, &bid_pool); } static void bio_integrity_verify_fn(struct work_struct *work) { struct bio_integrity_data *bid = container_of(work, struct bio_integrity_data, work); struct bio *bio = bid->bio; blk_integrity_verify_iter(bio, &bid->saved_bio_iter); bio_integrity_finish(bid); bio_endio(bio); } #define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) static bool bip_should_check(struct bio_integrity_payload *bip) { return bip->bip_flags & BIP_CHECK_FLAGS; } static bool bi_offload_capable(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_CRC64: return bi->metadata_size == sizeof(struct crc64_pi_tuple); case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: return bi->metadata_size == sizeof(struct t10_pi_tuple); default: pr_warn_once("%s: unknown integrity checksum type:%d\n", __func__, bi->csum_type); fallthrough; case BLK_INTEGRITY_CSUM_NONE: return false; } } /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio * * Normally I/O completion is done in interrupt context. However, verifying I/O * integrity is a time-consuming task which must be run in process context. * * This function postpones completion accordingly. */ bool __bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_data *bid = container_of(bip, struct bio_integrity_data, bip); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bip_should_check(bip)) { INIT_WORK(&bid->work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bid->work); return false; } bio_integrity_finish(bid); return true; } /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * * Checks if the bio already has an integrity payload attached. If it does, the * payload has been generated by another kernel subsystem, and we just pass it * through. * Otherwise allocates integrity payload and for writes the integrity metadata * will be generated. For reads, the completion handler will verify the * metadata. */ bool bio_integrity_prep(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; bool set_flags = true; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; if (!bi) return true; if (!bio_sectors(bio)) return true; /* Already protected? */ if (bio_integrity(bio)) return true; switch (bio_op(bio)) { case REQ_OP_READ: if (bi->flags & BLK_INTEGRITY_NOVERIFY) { if (bi_offload_capable(bi)) return true; set_flags = false; } break; case REQ_OP_WRITE: /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk for non-integrity metadata where nothing else * initializes the memory. */ if (bi->flags & BLK_INTEGRITY_NOGENERATE) { if (bi_offload_capable(bi)) return true; set_flags = false; gfp |= __GFP_ZERO; } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) gfp |= __GFP_ZERO; break; default: return true; } if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return true; /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (!buf) goto err_end_io; bid = mempool_alloc(&bid_pool, GFP_NOIO); if (!bid) goto err_free_buf; bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); bid->bio = bio; bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (set_flags) { if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) bid->bip.bip_flags |= BIP_CHECK_GUARD; if (bi->flags & BLK_INTEGRITY_REF_TAG) bid->bip.bip_flags |= BIP_CHECK_REFTAG; } if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) goto err_end_io; /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; return true; err_free_buf: kfree(buf); err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; } EXPORT_SYMBOL(bio_integrity_prep); void blk_flush_integrity(void) { flush_workqueue(kintegrityd_wq); } static int __init blk_integrity_auto_init(void) { bid_slab = kmem_cache_create("bio_integrity_data", sizeof(struct bio_integrity_data), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) panic("bio: can't create integrity pool\n"); /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. */ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); return 0; } subsys_initcall(blk_integrity_auto_init);
267 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tcp.h> #include <linux/vmalloc.h> #include <net/request_sock.h> /* * Maximum number of SYN_RECV sockets in queue per LISTEN socket. * One SYN_RECV socket costs about 80bytes on a 32bit machine. * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. * * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most * of problems. * This value is adjusted to 128 for low memory machines, * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ void reqsk_queue_alloc(struct request_sock_queue *queue) { queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; queue->rskq_accept_head = NULL; } /* * This function is called to set a Fast Open socket's "fastopen_rsk" field * to NULL when a TFO socket no longer needs to access the request_sock. * This happens only after 3WHS has been either completed or aborted (e.g., * RST is received). * * Before TFO, a child socket is created only after 3WHS is completed, * hence it never needs to access the request_sock. things get a lot more * complex with TFO. A child socket, accepted or not, has to access its * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, * until 3WHS is either completed or aborted. Afterwards the req will stay * until either the child socket is accepted, or in the rare case when the * listener is closed before the child is accepted. * * In short, a request socket is only freed after BOTH 3WHS has completed * (or aborted) and the child socket has been accepted (or listener closed). * When a child socket is accepted, its corresponding req->sk is set to * NULL since it's no longer needed. More importantly, "req->sk == NULL" * will be used by the code below to determine if a child socket has been * accepted or not, and the check is protected by the fastopenq->lock * described below. * * Note that fastopen_rsk is only accessed from the child socket's context * with its socket lock held. But a request_sock (req) can be accessed by * both its child socket through fastopen_rsk, and a listener socket through * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. * only in the rare case when both the listener and the child locks are held, * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. * The lock also protects other fields such as fastopenq->qlen, which is * decremented by this function when fastopen_rsk is no longer needed. * * Note that another solution was to simply use the existing socket lock * from the listener. But first socket lock is difficult to use. It is not * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to * acquire a child's lock while holding listener's socket lock. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; if (!reset || lsk->sk_state != TCP_LISTEN) { /* If the listener has been closed don't bother with the * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. * This is a simple defense against TFO spoofing attack - by * counting the req against fastopen.max_qlen, and disabling * TFO when the qlen exceeds max_qlen. * * For more details see CoNext'11 "TCP Fast Open" paper. */ req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else fastopenq->rskq_rst_tail->dl_next = req; req->dl_next = NULL; fastopenq->rskq_rst_tail = req; fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); }
1 22 22 24 24 24 24 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitfield.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/xarray.h> #include <net/net_namespace.h> #include <net/psp.h> #include <net/udp.h> #include "psp.h" #include "psp-nl-gen.h" DEFINE_XARRAY_ALLOC1(psp_devs); struct mutex psp_devs_lock; /** * DOC: PSP locking * * psp_devs_lock protects the psp_devs xarray. * Ordering is take the psp_devs_lock and then the instance lock. * Each instance is protected by RCU, and has a refcount. * When driver unregisters the instance gets flushed, but struct sticks around. */ /** * psp_dev_check_access() - check if user in a given net ns can access PSP dev * @psd: PSP device structure user is trying to access * @net: net namespace user is in * * Return: 0 if PSP device should be visible in @net, errno otherwise. */ int psp_dev_check_access(struct psp_dev *psd, struct net *net) { if (dev_net(psd->main_netdev) == net) return 0; return -ENOENT; } /** * psp_dev_create() - create and register PSP device * @netdev: main netdevice * @psd_ops: driver callbacks * @psd_caps: device capabilities * @priv_ptr: back-pointer to driver private data * * Return: pointer to allocated PSP device, or ERR_PTR. */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr) { struct psp_dev *psd; static u32 last_id; int err; if (WARN_ON(!psd_caps->versions || !psd_ops->set_config || !psd_ops->key_rotate || !psd_ops->rx_spi_alloc || !psd_ops->tx_key_add || !psd_ops->tx_key_del)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); if (!psd) return ERR_PTR(-ENOMEM); psd->main_netdev = netdev; psd->ops = psd_ops; psd->caps = psd_caps; psd->drv_priv = priv_ptr; mutex_init(&psd->lock); INIT_LIST_HEAD(&psd->active_assocs); INIT_LIST_HEAD(&psd->prev_assocs); INIT_LIST_HEAD(&psd->stale_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, &last_id, GFP_KERNEL); if (err) { mutex_unlock(&psp_devs_lock); kfree(psd); return ERR_PTR(err); } mutex_lock(&psd->lock); mutex_unlock(&psp_devs_lock); psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); rcu_assign_pointer(netdev->psp_dev, psd); mutex_unlock(&psd->lock); return psd; } EXPORT_SYMBOL(psp_dev_create); void psp_dev_free(struct psp_dev *psd) { mutex_lock(&psp_devs_lock); xa_erase(&psp_devs, psd->id); mutex_unlock(&psp_devs_lock); mutex_destroy(&psd->lock); kfree_rcu(psd, rcu); } /** * psp_dev_unregister() - unregister PSP device * @psd: PSP device structure */ void psp_dev_unregister(struct psp_dev *psd) { struct psp_assoc *pas, *next; mutex_lock(&psp_devs_lock); mutex_lock(&psd->lock); psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); /* Wait until psp_dev_free() to call xa_erase() to prevent a * different psd from being added to the xarray with this id, while * there are still references to this psd being held. */ xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); list_splice_init(&psd->active_assocs, &psd->prev_assocs); list_splice_init(&psd->prev_assocs, &psd->stale_assocs); list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list) psp_dev_tx_key_del(psd, pas); rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); psd->ops = NULL; psd->drv_priv = NULL; mutex_unlock(&psd->lock); psp_dev_put(psd); } EXPORT_SYMBOL(psp_dev_unregister); unsigned int psp_key_size(u32 version) { switch (version) { case PSP_VERSION_HDR0_AES_GCM_128: case PSP_VERSION_HDR0_AES_GMAC_128: return 16; case PSP_VERSION_HDR0_AES_GCM_256: case PSP_VERSION_HDR0_AES_GMAC_256: return 32; default: return 0; } } EXPORT_SYMBOL(psp_key_size); static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, unsigned int udp_len, __be16 sport) { struct udphdr *uh = udp_hdr(skb); struct psphdr *psph = (struct psphdr *)(uh + 1); uh->dest = htons(PSP_DEFAULT_UDP_PORT); uh->source = udp_flow_src_port(net, skb, 0, 0, false); uh->check = 0; uh->len = htons(udp_len); psph->nexthdr = IPPROTO_TCP; psph->hdrlen = PSP_HDRLEN_NOOPT; psph->crypt_offset = 0; psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) | FIELD_PREP(PSPHDR_VERFL_ONE, 1); psph->spi = spi; memset(&psph->iv, 0, sizeof(psph->iv)); } /* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling * them in. */ bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport) { u32 network_len = skb_network_header_len(skb); u32 ethr_len = skb_mac_header_len(skb); u32 bufflen = ethr_len + network_len; if (skb_cow_head(skb, PSP_ENCAP_HLEN)) return false; skb_push(skb, PSP_ENCAP_HLEN); skb->mac_header -= PSP_ENCAP_HLEN; skb->network_header -= PSP_ENCAP_HLEN; skb->transport_header -= PSP_ENCAP_HLEN; memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen); if (skb->protocol == htons(ETH_P_IP)) { ip_hdr(skb)->protocol = IPPROTO_UDP; be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN); ip_hdr(skb)->check = 0; ip_hdr(skb)->check = ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl); } else if (skb->protocol == htons(ETH_P_IPV6)) { ipv6_hdr(skb)->nexthdr = IPPROTO_UDP; be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN); } else { return false; } skb_set_inner_ipproto(skb, IPPROTO_TCP); skb_set_inner_transport_header(skb, skb_transport_offset(skb) + PSP_ENCAP_HLEN); skb->encapsulation = 1; psp_write_headers(net, skb, spi, ver, skb->len - skb_transport_offset(skb), sport); return true; } EXPORT_SYMBOL(psp_dev_encapsulate); /* Receive handler for PSP packets. * * Presently it accepts only already-authenticated packets and does not * support optional fields, such as virtualization cookies. The caller should * ensure that skb->data is pointing to the mac header, and that skb->mac_len * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE * is not supported). */ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) { int l2_hlen = 0, l3_hlen, encap; struct psp_skb_ext *pse; struct psphdr *psph; struct ethhdr *eth; struct udphdr *uh; __be16 proto; bool is_udp; eth = (struct ethhdr *)skb->data; proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen); if (proto == htons(ETH_P_IP)) l3_hlen = sizeof(struct iphdr); else if (proto == htons(ETH_P_IPV6)) l3_hlen = sizeof(struct ipv6hdr); else return -EINVAL; if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))) return -EINVAL; if (proto == htons(ETH_P_IP)) { struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); is_udp = iph->protocol == IPPROTO_UDP; l3_hlen = iph->ihl * 4; if (l3_hlen != sizeof(struct iphdr) && !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)) return -EINVAL; } else { struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); is_udp = ipv6h->nexthdr == IPPROTO_UDP; } if (unlikely(!is_udp)) return -EINVAL; uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen); if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) return -EINVAL; pse = skb_ext_add(skb, SKB_EXT_PSP); if (!pse) return -EINVAL; psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + sizeof(struct udphdr)); pse->spi = psph->spi; pse->dev_id = dev_id; pse->generation = generation; pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); encap = PSP_ENCAP_HLEN; encap += strip_icv ? PSP_TRL_SIZE : 0; if (proto == htons(ETH_P_IP)) { struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); iph->protocol = psph->nexthdr; iph->tot_len = htons(ntohs(iph->tot_len) - encap); iph->check = 0; iph->check = ip_fast_csum((u8 *)iph, iph->ihl); } else { struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); ipv6h->nexthdr = psph->nexthdr; ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); } memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); skb_pull(skb, PSP_ENCAP_HLEN); if (strip_icv) pskb_trim(skb, skb->len - PSP_TRL_SIZE); return 0; } EXPORT_SYMBOL(psp_dev_rcv); static int __init psp_init(void) { mutex_init(&psp_devs_lock); return genl_register_family(&psp_nl_family); } subsys_initcall(psp_init);
12 5 1 3 1 11 9 11 2 11 13 13 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 // SPDX-License-Identifier: GPL-2.0-only /* * GHASH: hash function for GCM (Galois/Counter Mode). * * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> * Copyright (c) 2009 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ /* * GHASH is a keyed hash function used in GCM authentication tag generation. * * The original GCM paper [1] presents GHASH as a function GHASH(H, A, C) which * takes a 16-byte hash key H, additional authenticated data A, and a ciphertext * C. It formats A and C into a single byte string X, interprets X as a * polynomial over GF(2^128), and evaluates this polynomial at the point H. * * However, the NIST standard for GCM [2] presents GHASH as GHASH(H, X) where X * is the already-formatted byte string containing both A and C. * * "ghash" in the Linux crypto API uses the 'X' (pre-formatted) convention, * since the API supports only a single data stream per hash. Thus, the * formatting of 'A' and 'C' is done in the "gcm" template, not in "ghash". * * The reason "ghash" is separate from "gcm" is to allow "gcm" to use an * accelerated "ghash" when a standalone accelerated "gcm(aes)" is unavailable. * It is generally inappropriate to use "ghash" for other purposes, since it is * an "ε-almost-XOR-universal hash function", not a cryptographic hash function. * It can only be used securely in crypto modes specially designed to use it. * * [1] The Galois/Counter Mode of Operation (GCM) * (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.694.695&rep=rep1&type=pdf) * [2] Recommendation for Block Cipher Modes of Operation: Galois/Counter Mode (GCM) and GMAC * (https://csrc.nist.gov/publications/detail/sp/800-38d/final) */ #include <crypto/gf128mul.h> #include <crypto/ghash.h> #include <crypto/internal/hash.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); memset(dctx, 0, sizeof(*dctx)); return 0; } static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); be128 k; if (keylen != GHASH_BLOCK_SIZE) return -EINVAL; if (ctx->gf128) gf128mul_free_4k(ctx->gf128); BUILD_BUG_ON(sizeof(k) != GHASH_BLOCK_SIZE); memcpy(&k, key, GHASH_BLOCK_SIZE); /* avoid violating alignment rules */ ctx->gf128 = gf128mul_init_4k_lle(&k); memzero_explicit(&k, GHASH_BLOCK_SIZE); if (!ctx->gf128) return -ENOMEM; return 0; } static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; do { crypto_xor(dst, src, GHASH_BLOCK_SIZE); gf128mul_4k_lle((be128 *)dst, ctx->gf128); src += GHASH_BLOCK_SIZE; srclen -= GHASH_BLOCK_SIZE; } while (srclen >= GHASH_BLOCK_SIZE); return srclen; } static void ghash_flush(struct shash_desc *desc, const u8 *src, unsigned int len) { struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); u8 *dst = dctx->buffer; if (len) { crypto_xor(dst, src, len); gf128mul_4k_lle((be128 *)dst, ctx->gf128); } } static int ghash_finup(struct shash_desc *desc, const u8 *src, unsigned int len, u8 *dst) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); u8 *buf = dctx->buffer; ghash_flush(desc, src, len); memcpy(dst, buf, GHASH_BLOCK_SIZE); return 0; } static void ghash_exit_tfm(struct crypto_tfm *tfm) { struct ghash_ctx *ctx = crypto_tfm_ctx(tfm); if (ctx->gf128) gf128mul_free_4k(ctx->gf128); } static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, .finup = ghash_finup, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "ghash-generic", .cra_priority = 100, .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, .cra_exit = ghash_exit_tfm, }, }; static int __init ghash_mod_init(void) { return crypto_register_shash(&ghash_alg); } static void __exit ghash_mod_exit(void) { crypto_unregister_shash(&ghash_alg); } module_init(ghash_mod_init); module_exit(ghash_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("GHASH hash function"); MODULE_ALIAS_CRYPTO("ghash"); MODULE_ALIAS_CRYPTO("ghash-generic");
598 20361 16081 16009 28 15856 7560 9157 466 465 8 4165 4174 44 474 127 34 71 604 352 20 404 582 68 1 5206 2687 622 99 2266 10710 8031 1813 171 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <asm/barrier.h> /* * Circular doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ /** * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself * @name: name of the list_head */ #define LIST_HEAD_INIT(name) { &(name), &(name) } /** * LIST_HEAD - definition of a &struct list_head with initialization values * @name: name of the list_head */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) /** * INIT_LIST_HEAD - Initialize a list_head structure * @list: list_head structure to be initialized. * * Initializes the list_head to point to itself. If it is a list header, * the result is an empty list. */ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } #ifdef CONFIG_LIST_HARDENED #ifdef CONFIG_DEBUG_LIST # define __list_valid_slowpath #else # define __list_valid_slowpath __cold __preserve_most #endif /* * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, struct list_head *prev, struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_add_valid_or_report(). */ static __always_inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { /* * With the hardening version, elide checking if next and prev * are NULL, since the immediate dereference of them below would * result in a fault if NULL. * * With the reduced set of checks, we can afford to inline the * checks, which also gives the compiler a chance to elide some * of them completely if they can be proven at compile-time. If * one of the pre-conditions does not hold, the slow-path will * show a report which pre-condition failed. */ if (likely(next->prev == prev && prev->next == next && new != prev && new != next)) return true; ret = false; } ret &= __list_add_valid_or_report(new, prev, next); return ret; } /* * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a * corruption is detected, true otherwise. * * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking * inline to catch non-faulting corruptions, and only if a corruption is * detected calls the reporting function __list_del_entry_valid_or_report(). */ static __always_inline bool __list_del_entry_valid(struct list_head *entry) { bool ret = true; if (!IS_ENABLED(CONFIG_DEBUG_LIST)) { struct list_head *prev = entry->prev; struct list_head *next = entry->next; /* * With the hardening version, elide checking if next and prev * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate * dereference of them below would result in a fault. */ if (likely(prev->next == entry && next->prev == entry)) return true; ret = false; } ret &= __list_del_entry_valid_or_report(entry); return ret; } #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /* * Delete a list entry and clear the 'prev' pointer. * * This is a special-purpose list clearing method used in the networking code * for lists allocated as per-cpu, where we don't want to incur the extra * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this * needs to check the node 'prev' pointer instead of calling list_empty(). */ static inline void __list_del_clearprev(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = NULL; } static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_replace_init - replace old entry by new one and initialize the old one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_del(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add(entry1, pos); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_bulk_move_tail - move a subsection of a list to its tail * @head: the head that will follow our entry * @first: first entry to move * @last: last entry to move, can be the same as first * * Move all entries between @first and including @last before @head. * All three entries must belong to the same linked list. */ static inline void list_bulk_move_tail(struct list_head *head, struct list_head *first, struct list_head *last) { first->prev->next = last->next; last->next->prev = first->prev; head->prev->next = first; first->prev = head->prev; last->next = head; head->prev = last; } /** * list_is_first -- tests whether @list is the first entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_is_head - tests whether @list is the list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_head(const struct list_head *list, const struct list_head *head) { return list == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_del_init_careful - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. * * This is the same as list_del_init(), except designed to be used * together with list_empty_careful() in a way to guarantee ordering * of other memory operations. * * Any memory operations done before a list_del_init_careful() are * guaranteed to be visible after a list_empty_careful() test. */ static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_rotate_to_front() - Rotate list to specific item. * @list: The desired new front of the list. * @head: The head of the list. * * Rotates list so that @list becomes the new front of the list. */ static inline void list_rotate_to_front(struct list_head *list, struct list_head *head) { /* * Deletes the list head from the list denoted by @head and * places it as the tail of @list, this effectively rotates the * list so that @list is at the front. */ list_move_tail(head, list); } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_last_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->prev); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_next_entry_circular - get the next element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the last element (return the first element). * Note, that list is expected to be not empty. */ #define list_next_entry_circular(pos, head, member) \ (list_is_last(&(pos)->member, head) ? \ list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_prev_entry_circular - get the prev element in list * @pos: the type * to cursor. * @head: the list head to take the element from. * @member: the name of the list_head within the struct. * * Wraparound if pos is the first element (return the last element). * Note, that list is expected to be not empty. */ #define list_prev_entry_circular(pos, head, member) \ (list_is_first(&(pos)->member, head) ? \ list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; \ !list_is_head(pos, (head)); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** * list_count_nodes - count nodes in the list * @head: the head for your list. */ static inline size_t list_count_nodes(struct list_head *head) { struct list_head *pos; size_t count = 0; list_for_each(pos, head) count++; return count; } /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } /** * hlist_unhashed - Has node been removed from list and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed * state. For example, hlist_nulls_del_init_rcu() does leave the * node in unhashed state, but hlist_nulls_del() does not. */ static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } /** * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use * @h: Node to be checked * * This variant of hlist_unhashed() must be used in lockless contexts * to avoid potential load-tearing. The READ_ONCE() is paired with the * various WRITE_ONCE() in hlist helpers that are defined below. */ static inline int hlist_unhashed_lockless(const struct hlist_node *h) { return !READ_ONCE(h->pprev); } /** * hlist_empty - Is the specified hlist_head structure an empty hlist? * @h: Structure to check. */ static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) WRITE_ONCE(next->pprev, pprev); } /** * hlist_del - Delete the specified hlist_node from its list * @n: Node to delete. * * Note that this function leaves the node in hashed state. Use * hlist_del_init() or similar instead to unhash @n. */ static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_init - Delete the specified hlist_node from its list and initialize * @n: Node to delete. * * Note that this function leaves the node in unhashed state. */ static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /** * hlist_add_head - add a new entry at the beginning of the hlist * @n: new entry to be added * @h: hlist head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; WRITE_ONCE(n->next, first); if (first) WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n); WRITE_ONCE(n->pprev, &h->first); } /** * hlist_add_before - add a new entry before the one specified * @n: new entry to be added * @next: hlist node to add it before, which must be non-NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); WRITE_ONCE(n->next, next); WRITE_ONCE(next->pprev, &n->next); WRITE_ONCE(*(n->pprev), n); } /** * hlist_add_behind - add a new entry after the one specified * @n: new entry to be added * @prev: hlist node to add it after, which must be non-NULL */ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { WRITE_ONCE(n->next, prev->next); WRITE_ONCE(prev->next, n); WRITE_ONCE(n->pprev, &prev->next); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } /** * hlist_add_fake - create a fake hlist consisting of a single headless node * @n: Node to make a fake list out of * * This makes @n appear to be its own predecessor on a headless hlist. * The point of this is to allow things like hlist_del() to work correctly * in cases where there is no list. */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } /** * hlist_fake: Is this node a fake hlist? * @h: Node to check for being a self-referential fake hlist. */ static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /** * hlist_is_singular_node - is node the only element of the specified hlist? * @n: Node to check for singularity. * @h: Header for potentially singular list. * * Check whether the node is the only node of the head without * accessing head, thus avoiding unnecessary cache misses. */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /** * hlist_move_list - Move an hlist * @old: hlist_head for old list. * @new: hlist_head for new list. * * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } /** * hlist_splice_init() - move all entries from one list to another * @from: hlist_head from which entries will be moved * @last: last entry on the @from list * @to: hlist_head to which entries will be moved * * @to can be empty, @from must contain at least @last. */ static inline void hlist_splice_init(struct hlist_head *from, struct hlist_node *last, struct hlist_head *to) { if (to->first) to->first->pprev = &last->next; last->next = to->first; to->first = from->first; from->first->pprev = &to->first; from->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: a &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) /** * hlist_count_nodes - count nodes in the hlist * @head: the head for your hlist. */ static inline size_t hlist_count_nodes(struct hlist_head *head) { struct hlist_node *pos; size_t count = 0; hlist_for_each(pos, head) count++; return count; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PROCESSOR_H #define _ASM_X86_PROCESSOR_H #include <asm/processor-flags.h> /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; struct io_bitmap; struct vm86; #include <asm/math_emu.h> #include <asm/segment.h> #include <asm/types.h> #include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeatures.h> #include <asm/cpuid/api.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> #include <asm/desc_defs.h> #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> #include <asm/unwind_hints.h> #include <asm/vmxfeatures.h> #include <asm/vdso/processor.h> #include <asm/shstk.h> #include <linux/personality.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/math64.h> #include <linux/err.h> #include <linux/irqflags.h> #include <linux/mem_encrypt.h> /* * We handle most unaligned accesses in hardware. On the other hand * unaligned DMA can be quite expensive on some Nehalem processors. * * Based on this we disable the IP header alignment in network drivers. */ #define NET_IP_ALIGN 0 #define HBP_NUM 4 /* * These alignment constraints are for performance in the vSMP case, * but in the task_struct case we must also meet hardware imposed * alignment requirements of the FPU state: */ #ifdef CONFIG_X86_VSMP # define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) # define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else # define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif extern u16 __read_mostly tlb_lli_4k; extern u16 __read_mostly tlb_lli_2m; extern u16 __read_mostly tlb_lli_4m; extern u16 __read_mostly tlb_lld_4k; extern u16 __read_mostly tlb_lld_2m; extern u16 __read_mostly tlb_lld_4m; extern u16 __read_mostly tlb_lld_1g; /* * CPU type and hardware bug flags. Kept separately for each CPU. */ struct cpuinfo_topology { // Real APIC ID read from the local APIC u32 apicid; // The initial APIC ID provided by CPUID u32 initial_apicid; // Physical package ID u32 pkg_id; // Physical die ID on AMD, Relative on Intel u32 die_id; // Compute unit ID - AMD specific u32 cu_id; // Core ID relative to the package u32 core_id; // Logical ID mappings u32 logical_pkg_id; u32 logical_die_id; u32 logical_core_id; // AMD Node ID and Nodes per Package info u32 amd_node_id; // Cache level topology IDs u32 llc_id; u32 l2c_id; // Hardware defined CPU-type union { u32 cpu_type; struct { // CPUID.1A.EAX[23-0] u32 intel_native_model_id :24; // CPUID.1A.EAX[31-24] u32 intel_type :8; }; struct { // CPUID 0x80000026.EBX u32 amd_num_processors :16, amd_power_eff_ranking :8, amd_native_model_id :4, amd_type :4; }; }; }; struct cpuinfo_x86 { union { /* * The particular ordering (low-to-high) of (vendor, * family, model) is done in case range of models, like * it is usually done on AMD, need to be compared. */ struct { __u8 x86_model; /* CPU family */ __u8 x86; /* CPU vendor */ __u8 x86_vendor; __u8 x86_reserved; }; /* combined vendor, family, model */ __u32 x86_vfm; }; __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; #endif #ifdef CONFIG_X86_VMX_FEATURE_NAMES __u32 vmx_capability[NVMXINTS]; #endif __u8 x86_virt_bits; __u8 x86_phys_bits; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; /* * Align to size of unsigned long because the x86_capability array * is passed to bitops which require the alignment. Use unnamed * union to enforce the array is aligned to size of unsigned long. */ union { __u32 x86_capability[NCAPINTS + NBUGINTS]; unsigned long x86_capability_alignment; }; char x86_vendor_id[16]; char x86_model_id[64]; struct cpuinfo_topology topo; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* protected processor identification number */ u64 ppin; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; /* Index into per_cpu list: */ u16 cpu_index; /* Is SMT active on this core? */ bool smt_active; u32 microcode; /* Address space bits used by the cache internally */ u8 x86_cache_bits; unsigned initialized : 1; } __randomize_layout; #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 #define X86_VENDOR_UMC 3 #define X86_VENDOR_CENTAUR 5 #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 #define X86_VENDOR_ZHAOXIN 10 #define X86_VENDOR_VORTEX 11 #define X86_VENDOR_NUM 12 #define X86_VENDOR_UNKNOWN 0xff /* * capabilities of CPUs */ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); static inline unsigned long long l1tf_pfn_limit(void) { return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } void init_cpu_devs(void); void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_secondary_cpu(unsigned int cpu); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); /* * Friendlier CR3 helpers. */ static inline unsigned long read_cr3_pa(void) { return __read_cr3() & CR3_ADDR_MASK; } static inline unsigned long native_read_cr3_pa(void) { return __native_read_cr3() & CR3_ADDR_MASK; } static inline void load_cr3(pgd_t *pgdir) { write_cr3(__sme_pa(pgdir)); } /* * Note that while the legacy 'TSS' name comes from 'Task State Segment', * on modern x86 CPUs the TSS also holds information important to 64-bit mode, * unrelated to the task-switch mechanism: */ #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. */ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; unsigned long sp1; /* * We don't use ring 1, so ss1 is a convenient scratch space in * the same cacheline as sp0. We use ss1 to cache the value in * MSR_IA32_SYSENTER_CS. When we context switch * MSR_IA32_SYSENTER_CS, we first check if the new value being * written matches ss1, and, if it's not, then we wrmsr the new * value and update ss1. * * The only reason we context switch MSR_IA32_SYSENTER_CS is * that we set it to zero in vm86 tasks to avoid corrupting the * stack if we were to go through the sysenter path from vm86 * mode. */ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; unsigned long ip; unsigned long flags; unsigned long ax; unsigned long cx; unsigned long dx; unsigned long bx; unsigned long sp; unsigned long bp; unsigned long si; unsigned long di; unsigned short es, __esh; unsigned short cs, __csh; unsigned short ss, __ssh; unsigned short ds, __dsh; unsigned short fs, __fsh; unsigned short gs, __gsh; unsigned short ldt, __ldth; unsigned short trace; unsigned short io_bitmap_base; } __attribute__((packed)); #else struct x86_hw_tss { u32 reserved1; u64 sp0; u64 sp1; /* * Since Linux does not use ring 2, the 'sp2' slot is unused by * hardware. entry_SYSCALL_64 uses it as scratch space to stash * the user RSP value. */ u64 sp2; u64 reserved2; u64 ist[7]; u32 reserved3; u32 reserved4; u16 reserved5; u16 io_bitmap_base; } __attribute__((packed)); #endif /* * IO-bitmap sizes: */ #define IO_BITMAP_BITS 65536 #define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long)) #define IO_BITMAP_OFFSET_VALID_MAP \ (offsetof(struct tss_struct, io_bitmap.bitmap) - \ offsetof(struct tss_struct, x86_tss)) #define IO_BITMAP_OFFSET_VALID_ALL \ (offsetof(struct tss_struct, io_bitmap.mapall) - \ offsetof(struct tss_struct, x86_tss)) #ifdef CONFIG_X86_IOPL_IOPERM /* * sizeof(unsigned long) coming from an extra "long" at the end of the * iobitmap. The limit is inclusive, i.e. the last valid byte. */ # define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \ sizeof(unsigned long) - 1) #else # define __KERNEL_TSS_LIMIT \ (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1) #endif /* Base offset outside of TSS_LIMIT so unpriviledged IO causes #GP */ #define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1) struct entry_stack { char stack[PAGE_SIZE]; }; struct entry_stack_page { struct entry_stack stack; } __aligned(PAGE_SIZE); /* * All IO bitmap related data stored in the TSS: */ struct x86_io_bitmap { /* The sequence number of the last active bitmap. */ u64 prev_sequence; /* * Store the dirty size of the last io bitmap offender. The next * one will have to do the cleanup as the switch out to a non io * bitmap user will just set x86_tss.io_bitmap_base to a value * outside of the TSS limit. So for sane tasks there is no need to * actually touch the io_bitmap at all. */ unsigned int prev_max; /* * The extra 1 is there because the CPU will access an * additional byte beyond the end of the IO permission * bitmap. The extra byte must be all 1 bits, and must * be within the limit. */ unsigned long bitmap[IO_BITMAP_LONGS + 1]; /* * Special I/O bitmap to emulate IOPL(3). All bytes zero, * except the additional byte at the end. */ unsigned long mapall[IO_BITMAP_LONGS + 1]; }; struct tss_struct { /* * The fixed hardware portion. This must not cross a page boundary * at risk of violating the SDM's advice and potentially triggering * errata. */ struct x86_hw_tss x86_tss; struct x86_io_bitmap io_bitmap; } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* Per CPU interrupt stacks */ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); #ifdef CONFIG_X86_64 DECLARE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); #else DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); #endif DECLARE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack); /* const-qualified alias provided by the linker. */ DECLARE_PER_CPU_CACHE_HOT(const unsigned long __percpu_seg_override, const_cpu_current_top_of_stack); #ifdef CONFIG_X86_64 static inline unsigned long cpu_kernelmode_gs_base(int cpu) { #ifdef CONFIG_SMP return per_cpu_offset(cpu); #else return 0; #endif } extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); #endif /* X86_64 */ struct perf_event; struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; #ifdef CONFIG_X86_32 unsigned long sp0; #endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else unsigned short es; unsigned short ds; unsigned short fsindex; unsigned short gsindex; #endif #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; #else /* * XXX: this could presumably be unsigned short. Alternatively, * 32-bit kernels could be taught to use fsindex instead. */ unsigned long fs; unsigned long gs; #endif /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long virtual_dr6; /* Keep track of the exact dr7 value set by the user */ unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_nr; unsigned long error_code; #ifdef CONFIG_VM86 /* Virtual 86 mode info */ struct vm86 *vm86; #endif /* IO permissions: */ struct io_bitmap *io_bitmap; /* * IOPL. Privilege level dependent I/O permission which is * emulated via the I/O bitmap to prevent user space from disabling * interrupts. */ unsigned long iopl_emul; unsigned int iopl_warn:1; /* * Protection Keys Register for Userspace. Loaded immediately on * context switch. Store it in thread_struct to avoid a lookup in * the tasks's FPU xstate buffer. This value is only valid when a * task is scheduled out. For 'current' the authoritative source of * PKRU is the hardware itself. */ u32 pkru; #ifdef CONFIG_X86_USER_SHADOW_STACK unsigned long features; unsigned long features_locked; struct thread_shstk shstk; #endif }; #ifdef CONFIG_X86_DEBUG_FPU extern struct fpu *x86_task_fpu(struct task_struct *task); #else # define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) #endif extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { fpu_thread_struct_whitelist(offset, size); } static inline void native_load_sp0(unsigned long sp0) { this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static __always_inline void native_swapgs(void) { #ifdef CONFIG_X86_64 asm volatile("swapgs" ::: "memory"); #endif } static __always_inline unsigned long current_top_of_stack(void) { /* * We can't read directly from tss.sp0: sp0 on x86_32 is special in * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) return this_cpu_read_const(const_cpu_current_top_of_stack); return this_cpu_read_stable(cpu_current_top_of_stack); } static __always_inline bool on_thread_stack(void) { return (unsigned long)(current_top_of_stack() - current_stack_pointer) < THREAD_SIZE; } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else static inline void load_sp0(unsigned long sp0) { native_load_sp0(sp0); } #endif /* CONFIG_PARAVIRT_XXL */ unsigned long __get_wchan(struct task_struct *p); extern void select_idle_routine(void); extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern void enable_sep_cpu(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); extern void cpu_init_exception_handling(bool boot_cpu); extern void cpu_init_replace_early_idt(void); extern void cr4_init(void); extern void set_task_blockstep(struct task_struct *task, bool on); /* Boot loader type from the setup header: */ extern int bootloader_type; extern int bootloader_version; extern char ignore_fpu_irq; #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW #ifdef CONFIG_X86_32 # define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else # define BASE_PREFETCH "prefetcht0 %1" #endif /* * Prefetch instructions for Pentium III (+) and AMD Athlon (+) * * It's not worth to care about 3dnow prefetches for the K6 * because they are microcoded there and very slow. */ static inline void prefetch(const void *x) { alternative_input(BASE_PREFETCH, "prefetchnta %1", X86_FEATURE_XMM, "m" (*(const char *)x)); } /* * 3dnow prefetch to get an exclusive cache line. * Useful for spinlocks to avoid one state transition in the * cache coherency protocol: */ static __always_inline void prefetchw(const void *x) { alternative_input(BASE_PREFETCH, "prefetchw %1", X86_FEATURE_3DNOWPREFETCH, "m" (*(const char *)x)); } #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) #define task_pt_regs(task) \ ({ \ unsigned long __ptr = (unsigned long)task_stack_page(task); \ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ ((struct pt_regs *)__ptr) - 1; \ }) #ifdef CONFIG_X86_32 #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ } #else extern unsigned long __top_init_kernel_stack[]; #define INIT_THREAD { \ .sp = (unsigned long)&__top_init_kernel_stack, \ } #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp); /* * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) #define KSTK_ESP(task) (task_pt_regs(task)->sp) /* Get/set a process' ability to use the timestamp counter instruction */ #define GET_TSC_CTL(adr) get_tsc_mode((adr)) #define SET_TSC_CTL(val) set_tsc_mode((val)) extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); static inline u32 per_cpu_llc_id(unsigned int cpu) { return per_cpu(cpu_info.topo.llc_id, cpu); } static inline u32 per_cpu_l2c_id(unsigned int cpu) { return per_cpu(cpu_info.topo.l2c_id, cpu); } #ifdef CONFIG_CPU_SUP_AMD /* * Issue a DIV 0/1 insn to clear any division data from previous DIV * operations. */ static __always_inline void amd_clear_divider(void) { asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) :: "a" (0), "d" (0), "r" (1)); } extern void amd_check_microcode(void); #else static inline void amd_clear_divider(void) { } static inline void amd_check_microcode(void) { } #endif extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); extern void free_kernel_image_pages(const char *what, void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN bool xen_set_default_idle(void); #else #define xen_set_default_idle 0 #endif void __noreturn stop_this_cpu(void *dummy); void microcode_check(struct cpuinfo_x86 *prev_info); void store_cpu_caps(struct cpuinfo_x86 *info); DECLARE_PER_CPU(bool, cache_state_incoherent); enum l1tf_mitigations { L1TF_MITIGATION_OFF, L1TF_MITIGATION_AUTO, L1TF_MITIGATION_FLUSH_NOWARN, L1TF_MITIGATION_FLUSH, L1TF_MITIGATION_FLUSH_NOSMT, L1TF_MITIGATION_FULL, L1TF_MITIGATION_FULL_FORCE }; extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, MDS_MITIGATION_AUTO, MDS_MITIGATION_FULL, MDS_MITIGATION_VMWERV, }; extern bool gds_ucode_mitigated(void); /* * Make previous memory operations globally visible before * a WRMSR. * * MFENCE makes writes visible, but only affects load/store * instructions. WRMSR is unfortunately not a load/store * instruction and is unaffected by MFENCE. The LFENCE ensures * that the WRMSR is not reordered. * * Most WRMSRs are full serializing instructions themselves and * do not require this barrier. This is only required for the * IA32_TSC_DEADLINE and X2APIC MSRs. */ static inline void weak_wrmsr_fence(void) { alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); } #endif /* _ASM_X86_PROCESSOR_H */
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 // SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/mISDNif.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/sched/cputime.h> #include <linux/signal.h> #include "core.h" static u_int *debug; static inline void _queue_message(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); skb_queue_tail(&st->msgq, skb); if (likely(!test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_set_bit(mISDN_STACK_WORK, &st->status); wake_up_interruptible(&st->workq); } } static int mISDN_queue_message(struct mISDNchannel *ch, struct sk_buff *skb) { _queue_message(ch->st, skb); return 0; } static struct mISDNchannel * get_channel4id(struct mISDNstack *st, u_int id) { struct mISDNchannel *ch; mutex_lock(&st->lmutex); list_for_each_entry(ch, &st->layer2, list) { if (id == ch->nr) goto unlock; } ch = NULL; unlock: mutex_unlock(&st->lmutex); return ch; } static void send_socklist(struct mISDN_sock_list *sl, struct sk_buff *skb) { struct sock *sk; struct sk_buff *cskb = NULL; read_lock(&sl->lock); sk_for_each(sk, &sl->head) { if (sk->sk_state != MISDN_BOUND) continue; if (!cskb) cskb = skb_copy(skb, GFP_ATOMIC); if (!cskb) { printk(KERN_WARNING "%s no skb\n", __func__); break; } if (!sock_queue_rcv_skb(sk, cskb)) cskb = NULL; } read_unlock(&sl->lock); dev_kfree_skb(cskb); } static void send_layer2(struct mISDNstack *st, struct sk_buff *skb) { struct sk_buff *cskb; struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int ret; if (!st) return; mutex_lock(&st->lmutex); if ((hh->id & MISDN_ID_ADDR_MASK) == MISDN_ID_ANY) { /* L2 for all */ list_for_each_entry(ch, &st->layer2, list) { if (list_is_last(&ch->list, &st->layer2)) { cskb = skb; skb = NULL; } else { cskb = skb_copy(skb, GFP_KERNEL); } if (cskb) { ret = ch->send(ch, cskb); if (ret) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s ch%d prim(%x) addr(%x)" " err %d\n", __func__, ch->nr, hh->prim, ch->addr, ret); dev_kfree_skb(cskb); } } else { printk(KERN_WARNING "%s ch%d addr %x no mem\n", __func__, ch->nr, ch->addr); goto out; } } } else { list_for_each_entry(ch, &st->layer2, list) { if ((hh->id & MISDN_ID_ADDR_MASK) == ch->addr) { ret = ch->send(ch, skb); if (!ret) skb = NULL; goto out; } } ret = st->dev->teimgr->ctrl(st->dev->teimgr, CHECK_DATA, skb); if (!ret) skb = NULL; else if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s mgr prim(%x) err %d\n", __func__, hh->prim, ret); } out: mutex_unlock(&st->lmutex); dev_kfree_skb(skb); } static inline int send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int lm; lm = hh->prim & MISDN_LAYERMASK; if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); if (lm == 0x1) { if (!hlist_empty(&st->l1sock.head)) { __net_timestamp(skb); send_socklist(&st->l1sock, skb); } return st->layer1->send(st->layer1, skb); } else if (lm == 0x2) { if (!hlist_empty(&st->l1sock.head)) send_socklist(&st->l1sock, skb); send_layer2(st, skb); return 0; } else if (lm == 0x4) { ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } static void do_clear_stack(struct mISDNstack *st) { } static int mISDNStackd(void *data) { struct mISDNstack *st = data; #ifdef MISDN_MSG_STATS u64 utime, stime; #endif int err = 0; sigfillset(&current->blocked); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "mISDNStackd %s started\n", dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } for (;;) { struct sk_buff *skb; if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); } else test_and_set_bit(mISDN_STACK_RUNNING, &st->status); while (test_bit(mISDN_STACK_WORK, &st->status)) { skb = skb_dequeue(&st->msgq); if (!skb) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); /* test if a race happens */ skb = skb_dequeue(&st->msgq); if (!skb) continue; test_and_set_bit(mISDN_STACK_WORK, &st->status); } #ifdef MISDN_MSG_STATS st->msg_cnt++; #endif err = send_msg_to_layer(st, skb); if (unlikely(err)) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); continue; } if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); break; } } if (test_bit(mISDN_STACK_CLEARING, &st->status)) { test_and_set_bit(mISDN_STACK_STOPPED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); do_clear_stack(st); test_and_clear_bit(mISDN_STACK_CLEARING, &st->status); test_and_set_bit(mISDN_STACK_RESTART, &st->status); } if (test_and_clear_bit(mISDN_STACK_RESTART, &st->status)) { test_and_clear_bit(mISDN_STACK_STOPPED, &st->status); test_and_set_bit(mISDN_STACK_RUNNING, &st->status); if (!skb_queue_empty(&st->msgq)) test_and_set_bit(mISDN_STACK_WORK, &st->status); } if (test_bit(mISDN_STACK_ABORT, &st->status)) break; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } #ifdef MISDN_MSG_STATS st->sleep_cnt++; #endif test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); wait_event_interruptible(st->workq, (st->status & mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); if (test_bit(mISDN_STACK_STOPPED, &st->status)) { test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); #ifdef MISDN_MSG_STATS st->stopped_cnt++; #endif } } #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_ABORT, &st->status); skb_queue_purge(&st->msgq); st->thread = NULL; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } return 0; } static int l1_receive(struct mISDNchannel *ch, struct sk_buff *skb) { if (!ch->st) return -ENODEV; __net_timestamp(skb); _queue_message(ch->st, skb); return 0; } void set_channel_address(struct mISDNchannel *ch, u_int sapi, u_int tei) { ch->addr = sapi | (tei << 8); } void __add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { list_add_tail(&ch->list, &st->layer2); } void add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { mutex_lock(&st->lmutex); __add_layer2(ch, st); mutex_unlock(&st->lmutex); } static int st_own_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { if (!ch->st || !ch->st->layer1) return -EINVAL; return ch->st->layer1->ctrl(ch->st->layer1, cmd, arg); } int create_stack(struct mISDNdevice *dev) { struct mISDNstack *newst; int err; DECLARE_COMPLETION_ONSTACK(done); newst = kzalloc(sizeof(struct mISDNstack), GFP_KERNEL); if (!newst) { printk(KERN_ERR "kmalloc mISDN_stack failed\n"); return -ENOMEM; } newst->dev = dev; INIT_LIST_HEAD(&newst->layer2); INIT_HLIST_HEAD(&newst->l1sock.head); rwlock_init(&newst->l1sock.lock); init_waitqueue_head(&newst->workq); skb_queue_head_init(&newst->msgq); mutex_init(&newst->lmutex); dev->D.st = newst; err = create_teimanager(dev); if (err) { printk(KERN_ERR "kmalloc teimanager failed\n"); kfree(newst); return err; } dev->teimgr->peer = &newst->own; dev->teimgr->recv = mISDN_queue_message; dev->teimgr->st = newst; newst->layer1 = &dev->D; dev->D.recv = l1_receive; dev->D.peer = &newst->own; newst->own.st = newst; newst->own.ctrl = st_own_ctrl; newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else wait_for_completion(&done); return err; } int connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret %d (dev %d)\n", __func__, err, dev->id); if (err) return err; write_lock_bh(&dev->D.st->l1sock.lock); sk_add_node(&msk->sk, &dev->D.st->l1sock.head); write_unlock_bh(&dev->D.st->l1sock.lock); break; default: return -ENOPROTOOPT; } return 0; } int connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq, rq2; int pmask, err; struct Bprotocol *bp; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; pmask = 1 << (protocol & ISDN_P_B_MASK); if (pmask & dev->Bprotocols) { rq.protocol = protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) return err; ch->recv = rq.ch->send; ch->peer = rq.ch; rq.ch->recv = ch->send; rq.ch->peer = ch; rq.ch->st = dev->D.st; } else { bp = get_Bprotocol4mask(pmask); if (!bp) return -ENOPROTOOPT; rq2.protocol = protocol; rq2.adr = *adr; rq2.ch = ch; err = bp->create(&rq2); if (err) return err; ch->recv = rq2.ch->send; ch->peer = rq2.ch; rq2.ch->st = dev->D.st; rq.protocol = rq2.protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) { rq2.ch->ctrl(rq2.ch, CLOSE_CHANNEL, NULL); return err; } rq2.ch->recv = rq.ch->send; rq2.ch->peer = rq.ch; rq.ch->recv = rq2.ch->send; rq.ch->peer = rq2.ch; rq.ch->st = dev->D.st; } ch->protocol = protocol; ch->nr = rq.ch->nr; return 0; } int create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; if (dev->Dprotocols & (1 << ISDN_P_TE_E1)) rq.protocol = ISDN_P_TE_E1; switch (protocol) { case ISDN_P_LAPD_NT: rq.protocol = ISDN_P_NT_S0; if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; fallthrough; case ISDN_P_LAPD_TE: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.adr.channel = 0; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) break; rq.protocol = protocol; rq.adr = *adr; rq.ch = ch; err = dev->teimgr->ctrl(dev->teimgr, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 2 %d\n", __func__, err); if (!err) { if ((protocol == ISDN_P_LAPD_NT) && !rq.ch) break; add_layer2(rq.ch, dev->D.st); rq.ch->recv = mISDN_queue_message; rq.ch->peer = &dev->D.st->own; rq.ch->ctrl(rq.ch, OPEN_CHANNEL, NULL); /* can't fail */ } break; default: err = -EPROTONOSUPPORT; } return err; } void delete_channel(struct mISDNchannel *ch) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct mISDNchannel *pch; if (!ch->st) { printk(KERN_WARNING "%s: no stack\n", __func__); return; } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); ch->peer = NULL; } return; } switch (ch->protocol) { case ISDN_P_NT_S0: case ISDN_P_TE_S0: case ISDN_P_NT_E1: case ISDN_P_TE_E1: write_lock_bh(&ch->st->l1sock.lock); sk_del_node_init(&msk->sk); write_unlock_bh(&ch->st->l1sock.lock); ch->st->dev->D.ctrl(&ch->st->dev->D, CLOSE_CHANNEL, NULL); break; case ISDN_P_LAPD_TE: pch = get_channel4id(ch->st, ch->nr); if (pch) { mutex_lock(&ch->st->lmutex); list_del(&pch->list); mutex_unlock(&ch->st->lmutex); pch->ctrl(pch, CLOSE_CHANNEL, NULL); pch = ch->st->dev->teimgr; pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; case ISDN_P_LAPD_NT: pch = ch->st->dev->teimgr; if (pch) { pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; default: break; } return; } void delete_stack(struct mISDNdevice *dev) { struct mISDNstack *st = dev->D.st; DECLARE_COMPLETION_ONSTACK(done); if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { if (st->notify) { printk(KERN_WARNING "%s: notifier in use\n", __func__); complete(st->notify); } st->notify = &done; test_and_set_bit(mISDN_STACK_ABORT, &st->status); test_and_set_bit(mISDN_STACK_WAKEUP, &st->status); wake_up_interruptible(&st->workq); wait_for_completion(&done); } if (!list_empty(&st->layer2)) printk(KERN_WARNING "%s: layer2 list not empty\n", __func__); if (!hlist_empty(&st->l1sock.head)) printk(KERN_WARNING "%s: layer1 list not empty\n", __func__); kfree(st); } void mISDN_initstack(u_int *dp) { debug = dp; }
194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 // SPDX-License-Identifier: GPL-2.0-only /* * Common code for control of lockd and nfsv4 grace periods. * * Transplanted from lockd code */ #include <linux/module.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> #include <linux/filelock.h> static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** * locks_start_grace * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * A grace period is a period during which locks should not be given * out. Currently grace periods are only enforced by the two lock * managers (lockd and nfsd), using the locks_in_grace() function to * check when they are in a grace period. * * This function is called to start a grace period. */ void locks_start_grace(struct net *net, struct lock_manager *lm) { struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); if (list_empty(&lm->list)) list_add(&lm->list, grace_list); else WARN(1, "double list_add attempt detected in net %x %s\n", net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to * resume regular locking. The grace period will not end until all lock * managers that called locks_start_grace() also call locks_end_grace(). * Note that callers count on it being safe to call this more than once, * and the second call should be a no-op. */ void locks_end_grace(struct lock_manager *lm) { spin_lock(&grace_lock); list_del_init(&lm->list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_end_grace); static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); struct lock_manager *lm; if (!open) return !list_empty(grace_list); spin_lock(&grace_lock); list_for_each_entry(lm, grace_list, list) { if (lm->block_opens) { spin_unlock(&grace_lock); return true; } } spin_unlock(&grace_lock); return false; } /** * locks_in_grace * @net: network namespace * * Lock managers call this function to determine when it is OK for them * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ bool locks_in_grace(struct net *net) { return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); bool opens_in_grace(struct net *net) { return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); static int __net_init grace_init_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); INIT_LIST_HEAD(grace_list); return 0; } static void __net_exit grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); WARN_ONCE(!list_empty(grace_list), "net %x %s: grace_list is not empty\n", net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { .init = grace_init_net, .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), }; static int __init init_grace(void) { return register_pernet_subsys(&grace_net_ops); } static void __exit exit_grace(void) { unregister_pernet_subsys(&grace_net_ops); } MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>"); MODULE_DESCRIPTION("NFS client and server infrastructure"); MODULE_LICENSE("GPL"); module_init(init_grace) module_exit(exit_grace)
505 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_DEFS_H #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> #include <linux/mem_encrypt.h> #include <asm/page_types.h> #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ #define _PAGE_BIT_USER 2 /* userspace addressable */ #define _PAGE_BIT_PWT 3 /* page write through */ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SOFTW4 57 /* available for programmer */ #define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ #define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ #ifdef CONFIG_X86_64 #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */ #else /* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */ #endif /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3) #else #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ _PAGE_PKEY_BIT2 | \ _PAGE_PKEY_BIT3) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) #else #define _PAGE_KNL_ERRATUM_MASK 0 #endif #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 1-4 are *not* involved * into swap entry computation, but bit 7 is used for thp migration, * so we borrow bit 1 for soft dirty tracking. * * Please note that this bit must be treated as swap dirty page * mark if and only if the PTE/PMD has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_RW #else #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) #define _PAGE_SWP_UFFD_WP _PAGE_USER #else #define _PAGE_UFFD_WP (_AT(pteval_t, 0)) #define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif /* * The hardware requires shadow stack to be Write=0,Dirty=1. However, * there are valid cases where the kernel might create read-only PTEs that * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have * (Write=0,SavedDirty=1,Dirty=0) set. */ #define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) #define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW) /* * Set of bits not changed in pte_modify. The pte's * protection key is treated like _PAGE_RW, for * instance, and is *not* included in this mask since * pte_modify() does modify it. */ #define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | \ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ _PAGE_CC | _PAGE_UFFD_WP) #define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) #define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. * * The resulting bits for PWT, PCD and PAT should be chosen in a way * to have the WB mode at index 0 (all bits clear). This is the default * right now and likely would break too much if changed. */ #ifndef __ASSEMBLER__ enum page_cache_mode { _PAGE_CACHE_MODE_WB = 0, _PAGE_CACHE_MODE_WC = 1, _PAGE_CACHE_MODE_UC_MINUS = 2, _PAGE_CACHE_MODE_UC = 3, _PAGE_CACHE_MODE_WT = 4, _PAGE_CACHE_MODE_WP = 5, _PAGE_CACHE_MODE_NUM = 8 }; #endif #define _PAGE_CC (_AT(pteval_t, cc_get_mask())) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) #define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define __PP _PAGE_PRESENT #define __RW _PAGE_RW #define _USR _PAGE_USER #define ___A _PAGE_ACCESSED #define ___D _PAGE_DIRTY #define ___G _PAGE_GLOBAL #define __NX _PAGE_NX #define _ENC _PAGE_ENC #define __WP _PAGE_CACHE_WP #define __NC _PAGE_NOCACHE #define _PSE _PAGE_PSE #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY_NOEXEC __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_COPY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) /* * Page tables needs to have Write=1 in order for any lower PTEs to be * writable. This includes shadow stack memory (Write=0, Dirty=1) */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) #define __PAGE_KERNEL_IO __PAGE_KERNEL #define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE #ifndef __ASSEMBLER__ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC) #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL | 0) #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP | 0) #define __pgprot_mask(x) __pgprot((x) & __default_kernel_pte_mask) #define PAGE_KERNEL __pgprot_mask(__PAGE_KERNEL | _ENC) #define PAGE_KERNEL_NOENC __pgprot_mask(__PAGE_KERNEL | 0) #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) #define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) #define PAGE_KERNEL_VVAR __pgprot_mask(__PAGE_KERNEL_VVAR | _ENC) #define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE) #endif /* __ASSEMBLER__ */ /* * early identity mapping pte attrib macros. */ #ifdef CONFIG_X86_64 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC #else #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif #ifdef CONFIG_X86_32 # include <asm/pgtable_32_types.h> #else # include <asm/pgtable_64_types.h> #endif #ifndef __ASSEMBLER__ #include <linux/types.h> /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) /* * Extracts the flags from a (pte|pmd|pud|pgd)val_t * This includes the protection key value. */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; static inline pgprot_t pgprot_nx(pgprot_t prot) { return __pgprot(pgprot_val(prot) | _PAGE_NX); } #define pgprot_nx pgprot_nx #ifdef CONFIG_X86_PAE /* * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't * use it here. */ #define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) #define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) /* * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. * All other bits are Reserved MBZ */ #define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ _PAGE_PWT | _PAGE_PCD | \ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) #else /* No need to mask any bits for !PAE */ #define PGD_ALLOWED_BITS (~0ULL) #endif static inline pgd_t native_make_pgd(pgdval_t val) { return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) { return native_pgd_val(pgd) & PTE_FLAGS_MASK; } #if CONFIG_PGTABLE_LEVELS > 4 typedef struct { p4dval_t p4d; } p4d_t; static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { val }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return p4d.p4d; } #else #include <asm-generic/pgtable-nop4d.h> static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) { return (pud_t) { val }; } static inline pudval_t native_pud_val(pud_t pud) { return pud.pud; } #else #include <asm-generic/pgtable-nopud.h> static inline pud_t native_make_pud(pudval_t val) { return (pud_t) { .p4d.pgd = native_make_pgd(val) }; } static inline pudval_t native_pud_val(pud_t pud) { return native_pgd_val(pud.p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return pmd.pmd; } #else #include <asm-generic/pgtable-nopmd.h> static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return native_pgd_val(pmd.pud.p4d.pgd); } #endif static inline p4dval_t p4d_pfn_mask(p4d_t p4d) { /* No 512 GiB huge pages yet */ return PTE_PFN_MASK; } static inline p4dval_t p4d_flags_mask(p4d_t p4d) { return ~p4d_pfn_mask(p4d); } static inline p4dval_t p4d_flags(p4d_t p4d) { return native_p4d_val(p4d) & p4d_flags_mask(p4d); } static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) return PHYSICAL_PUD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pudval_t pud_flags_mask(pud_t pud) { return ~pud_pfn_mask(pud); } static inline pudval_t pud_flags(pud_t pud) { return native_pud_val(pud) & pud_flags_mask(pud); } static inline pmdval_t pmd_pfn_mask(pmd_t pmd) { if (native_pmd_val(pmd) & _PAGE_PSE) return PHYSICAL_PMD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pmdval_t pmd_flags_mask(pmd_t pmd) { return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) { return (pte_t) { .pte = val }; } static inline pteval_t native_pte_val(pte_t pte) { return pte.pte; } static inline pteval_t pte_flags(pte_t pte) { return native_pte_val(pte) & PTE_FLAGS_MASK; } #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ (((cb) >> _PAGE_BIT_PWT) & 1)) #define __cm_idx2pte(i) \ ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ (((i) & 1) << _PAGE_BIT_PWT)) unsigned long cachemode2protval(enum page_cache_mode pcm); static inline pgprotval_t protval_4k_2_large(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); } static inline pgprotval_t protval_large_2_4k(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { return __pgprot(protval_large_2_4k(pgprot_val(pgprot))); } typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough extern pgprot_t pgprot_writethrough(pgprot_t prot); /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else #define native_pagetable_init paging_init #endif enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, PG_LEVEL_256T, PG_LEVEL_NUM }; #ifdef CONFIG_PROC_FS extern void update_page_count(int level, unsigned long pages); #else static inline void update_page_count(int level, unsigned long pages) { } #endif /* * Helper function that returns the kernel pagetable entry controlling * the virtual address 'address'. NULL means no pagetable entry present. * NOTE: the return type is pte_t but if the pmd is PSE then we return it * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages); #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */
32 10 10 92 51 41 5 1 3 1 933 193 2 2 434 2 2 1952 1954 1550 1526 1204 1055 934 2783 2782 33 1921 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 #include <linux/init.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_arp.h> #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_ipv4(&pkt); return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV4, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, [NF_INET_FORWARD] = nft_do_chain_ipv4, [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; static void nft_chain_filter_ipv4_init(void) { nft_register_chain_type(&nft_chain_filter_ipv4); } static void nft_chain_filter_ipv4_fini(void) { nft_unregister_chain_type(&nft_chain_filter_ipv4); } #else static inline void nft_chain_filter_ipv4_init(void) {} static inline void nft_chain_filter_ipv4_fini(void) {} #endif /* CONFIG_NF_TABLES_IPV4 */ #ifdef CONFIG_NF_TABLES_ARP static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_unspec(&pkt); return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_ARP, .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT), .hooks = { [NF_ARP_IN] = nft_do_chain_arp, [NF_ARP_OUT] = nft_do_chain_arp, }, }; static void nft_chain_filter_arp_init(void) { nft_register_chain_type(&nft_chain_filter_arp); } static void nft_chain_filter_arp_fini(void) { nft_unregister_chain_type(&nft_chain_filter_arp); } #else static inline void nft_chain_filter_arp_init(void) {} static inline void nft_chain_filter_arp_fini(void) {} #endif /* CONFIG_NF_TABLES_ARP */ #ifdef CONFIG_NF_TABLES_IPV6 static unsigned int nft_do_chain_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_ipv6(&pkt); return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV6, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, [NF_INET_FORWARD] = nft_do_chain_ipv6, [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, }, }; static void nft_chain_filter_ipv6_init(void) { nft_register_chain_type(&nft_chain_filter_ipv6); } static void nft_chain_filter_ipv6_fini(void) { nft_unregister_chain_type(&nft_chain_filter_ipv6); } #else static inline void nft_chain_filter_ipv6_init(void) {} static inline void nft_chain_filter_ipv6_fini(void) {} #endif /* CONFIG_NF_TABLES_IPV6 */ #ifdef CONFIG_NF_TABLES_INET static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (state->pf) { case NFPROTO_IPV4: nft_set_pktinfo_ipv4(&pkt); break; case NFPROTO_IPV6: nft_set_pktinfo_ipv6(&pkt); break; default: break; } return nft_do_chain(&pkt, priv); } static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_hook_state ingress_state = *state; struct nft_pktinfo pkt; switch (skb->protocol) { case htons(ETH_P_IP): /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */ ingress_state.pf = NFPROTO_IPV4; ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0) return NF_DROP; break; case htons(ETH_P_IPV6): ingress_state.pf = NFPROTO_IPV6; ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0) return NF_DROP; break; default: return NF_ACCEPT; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_inet = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_INET, .hook_mask = (1 << NF_INET_INGRESS) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_INGRESS] = nft_do_chain_inet_ingress, [NF_INET_LOCAL_IN] = nft_do_chain_inet, [NF_INET_LOCAL_OUT] = nft_do_chain_inet, [NF_INET_FORWARD] = nft_do_chain_inet, [NF_INET_PRE_ROUTING] = nft_do_chain_inet, [NF_INET_POST_ROUTING] = nft_do_chain_inet, }, }; static void nft_chain_filter_inet_init(void) { nft_register_chain_type(&nft_chain_filter_inet); } static void nft_chain_filter_inet_fini(void) { nft_unregister_chain_type(&nft_chain_filter_inet); } #else static inline void nft_chain_filter_inet_init(void) {} static inline void nft_chain_filter_inet_fini(void) {} #endif /* CONFIG_NF_TABLES_IPV6 */ #if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) static unsigned int nft_do_chain_bridge(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): nft_set_pktinfo_ipv6_validate(&pkt); break; default: nft_set_pktinfo_unspec(&pkt); break; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_bridge = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_BRIDGE, .hook_mask = (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_POST_ROUTING), .hooks = { [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, [NF_BR_LOCAL_IN] = nft_do_chain_bridge, [NF_BR_FORWARD] = nft_do_chain_bridge, [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, [NF_BR_POST_ROUTING] = nft_do_chain_bridge, }, }; static void nft_chain_filter_bridge_init(void) { nft_register_chain_type(&nft_chain_filter_bridge); } static void nft_chain_filter_bridge_fini(void) { nft_unregister_chain_type(&nft_chain_filter_bridge); } #else static inline void nft_chain_filter_bridge_init(void) {} static inline void nft_chain_filter_bridge_fini(void) {} #endif /* CONFIG_NF_TABLES_BRIDGE */ #ifdef CONFIG_NF_TABLES_NETDEV static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (skb->protocol) { case htons(ETH_P_IP): nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): nft_set_pktinfo_ipv6_validate(&pkt); break; default: nft_set_pktinfo_unspec(&pkt); break; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, .hook_mask = (1 << NF_NETDEV_INGRESS) | (1 << NF_NETDEV_EGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; static int nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_base_chain *basechain, bool changename) { struct nft_table *table = basechain->chain.table; struct nf_hook_ops *ops; struct nft_hook *hook; bool match; list_for_each_entry(hook, &basechain->hook_list, list) { ops = nft_hook_find_ops(hook, dev); match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); switch (event) { case NETDEV_UNREGISTER: /* NOP if not found or new name still matching */ if (!ops || (changename && match)) continue; if (!(table->flags & NFT_TABLE_F_DORMANT)) nf_unregister_net_hook(dev_net(dev), ops); list_del_rcu(&ops->list); kfree_rcu(ops, rcu); break; case NETDEV_REGISTER: /* NOP if not matching or already registered */ if (!match || (changename && ops)) continue; ops = kmemdup(&basechain->ops, sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); if (!ops) return 1; ops->dev = dev; if (!(table->flags & NFT_TABLE_F_DORMANT) && nf_register_net_hook(dev_net(dev), ops)) { kfree(ops); return 1; } list_add_tail_rcu(&ops->list, &hook->ops_list); break; } break; } return 0; } static int __nf_tables_netdev_event(unsigned long event, struct net_device *dev, bool changename) { struct nft_base_chain *basechain; struct nftables_pernet *nft_net; struct nft_chain *chain; struct nft_table *table; nft_net = nft_pernet(dev_net(dev)); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; basechain = nft_base_chain(chain); if (table->family == NFPROTO_INET && basechain->ops.hooknum != NF_INET_INGRESS) continue; if (nft_netdev_event(event, dev, basechain, changename)) return 1; } } return 0; } static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; int ret = NOTIFY_DONE; if (event != NETDEV_REGISTER && event != NETDEV_UNREGISTER && event != NETDEV_CHANGENAME) return NOTIFY_DONE; nft_net = nft_pernet(dev_net(dev)); mutex_lock(&nft_net->commit_mutex); if (event == NETDEV_CHANGENAME) { if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { ret = NOTIFY_BAD; goto out_unlock; } __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); } else if (__nf_tables_netdev_event(event, dev, false)) { ret = NOTIFY_BAD; } out_unlock: mutex_unlock(&nft_net->commit_mutex); return ret; } static struct notifier_block nf_tables_netdev_notifier = { .notifier_call = nf_tables_netdev_event, }; static int nft_chain_filter_netdev_init(void) { int err; nft_register_chain_type(&nft_chain_filter_netdev); err = register_netdevice_notifier(&nf_tables_netdev_notifier); if (err) goto err_register_netdevice_notifier; return 0; err_register_netdevice_notifier: nft_unregister_chain_type(&nft_chain_filter_netdev); return err; } static void nft_chain_filter_netdev_fini(void) { nft_unregister_chain_type(&nft_chain_filter_netdev); unregister_netdevice_notifier(&nf_tables_netdev_notifier); } #else static inline int nft_chain_filter_netdev_init(void) { return 0; } static inline void nft_chain_filter_netdev_fini(void) {} #endif /* CONFIG_NF_TABLES_NETDEV */ int __init nft_chain_filter_init(void) { int err; err = nft_chain_filter_netdev_init(); if (err < 0) return err; nft_chain_filter_ipv4_init(); nft_chain_filter_ipv6_init(); nft_chain_filter_arp_init(); nft_chain_filter_inet_init(); nft_chain_filter_bridge_init(); return 0; } void nft_chain_filter_fini(void) { nft_chain_filter_bridge_fini(); nft_chain_filter_inet_fini(); nft_chain_filter_arp_fini(); nft_chain_filter_ipv6_fini(); nft_chain_filter_ipv4_fini(); nft_chain_filter_netdev_fini(); }
12 12 34 1 11 3 6 8 6 4 1 1 2 23 6 8 9 23 18 6 6 6 10 10 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen); } static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 }, [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) { enum nft_data_types type; u8 reg; reg = ntohl(nla_get_be32(nla)); if (reg == NFT_REG_VERDICT) type = NFT_DATA_VERDICT; else type = NFT_DATA_VALUE; return type; } static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_data_desc desc = { .size = sizeof(priv->data), }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; priv->dlen = desc.len; err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], &priv->dreg, &priv->data, desc.type, desc.len); if (err < 0) goto err1; if (priv->dreg == NFT_REG_VERDICT) { struct nft_chain *chain = priv->data.verdict.chain; switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: err = nf_tables_bind_chain(ctx, chain); if (err < 0) goto err1; break; default: break; } } return 0; err1: nft_data_release(&priv->data, desc.type); return err; } static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_ctx chain_ctx; struct nft_chain *chain; struct nft_rule *rule; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; chain_ctx = *ctx; chain_ctx.chain = chain; list_for_each_entry(rule, &chain->rules, list) nft_rule_expr_activate(&chain_ctx, rule); nft_clear(ctx->net, chain); break; default: break; } } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx, struct nft_chain *chain, enum nft_trans_phase phase) { struct nft_ctx chain_ctx; struct nft_rule *rule; chain_ctx = *ctx; chain_ctx.chain = chain; list_for_each_entry(rule, &chain->rules, list) nft_rule_expr_deactivate(&chain_ctx, rule, phase); } static void nft_immediate_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_chain *chain; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; switch (phase) { case NFT_TRANS_PREPARE_ERROR: nf_tables_unbind_chain(ctx, chain); nft_deactivate_next(ctx->net, chain); break; case NFT_TRANS_PREPARE: nft_immediate_chain_deactivate(ctx, chain, phase); nft_deactivate_next(ctx->net, chain); break; default: nft_immediate_chain_deactivate(ctx, chain, phase); nft_chain_del(chain); chain->bound = false; nft_use_dec(&chain->table->use); break; } break; default: break; } } if (phase == NFT_TRANS_COMMIT) return; return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } static void nft_immediate_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_rule *rule, *n; struct nft_ctx chain_ctx; struct nft_chain *chain; if (priv->dreg != NFT_REG_VERDICT) return; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; /* Rule construction failed, but chain is already bound: * let the transaction records release this chain and its rules. */ if (chain->bound) { nft_use_dec(&chain->use); break; } /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; nft_use_dec(&chain->use); list_for_each_entry_safe(rule, n, &chain->rules, list) { nft_use_dec(&chain->use); list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } nf_tables_chain_destroy(chain); break; default: break; } } static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_IMMEDIATE_DREG, priv->dreg)) goto nla_put_failure; return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data, nft_dreg_to_type(priv->dreg), priv->dlen); nla_put_failure: return -1; } static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; if (priv->dreg != NFT_REG_VERDICT) return 0; data = &priv->data; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; pctx->level--; break; default: break; } return 0; } static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_immediate_expr *priv) { struct flow_action_entry *entry; const struct nft_data *data; entry = &flow->rule->action.entries[ctx->num_actions++]; data = &priv->data; switch (data->verdict.code) { case NF_ACCEPT: entry->id = FLOW_ACTION_ACCEPT; break; case NF_DROP: entry->id = FLOW_ACTION_DROP; break; default: return -EOPNOTSUPP; } return 0; } static int nft_immediate_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg == NFT_REG_VERDICT) return nft_immediate_offload_verdict(ctx, flow, priv); memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data)); return 0; } static bool nft_immediate_offload_action(const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg == NFT_REG_VERDICT) return true; return false; } static bool nft_immediate_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg != NFT_REG_VERDICT) nft_reg_track_cancel(track, priv->dreg, priv->dlen); return false; } static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), .eval = nft_immediate_eval, .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .reduce = nft_immediate_reduce, .offload = nft_immediate_offload, .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { .name = "immediate", .ops = &nft_imm_ops, .policy = nft_immediate_policy, .maxattr = NFTA_IMMEDIATE_MAX, .owner = THIS_MODULE, };
100 26 100 43 43 29 38 6 43 43 18 34 437 825 437 824 436 433 11 437 100 437 413 30 268 824 638 268 268 69 268 268 825 824 825 825 824 824 824 823 823 2 824 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; /* Limit batching if we have delayed rmaps pending */ if (tlb->delayed_rmap && tlb->active != &tlb->local) return false; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_page(GFP_NOWAIT); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { struct encoded_page **pages = batch->encoded_pages; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); unsigned int nr_pages = 1; if (unlikely(encoded_page_flags(enc) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages = encoded_nr_pages(pages[++i]); folio_remove_rmap_ptes(page_folio(page), page, nr_pages, vma); } } } /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so * we only need to walk through the current active batch and the * original local one. */ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (!tlb->delayed_rmap) return; tlb_flush_rmap_batch(&tlb->local, vma); if (tlb->active != &tlb->local) tlb_flush_rmap_batch(tlb->active, vma); tlb->delayed_rmap = 0; } #endif /* * We might end up freeing a lot of pages. Reschedule on a regular * basis to avoid soft lockups in configurations without full * preemption enabled. The magic number of 512 folios seems to work. */ #define MAX_NR_FOLIOS_PER_FREE 512 static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { struct encoded_page **pages = batch->encoded_pages; unsigned int nr, nr_pages; while (batch->nr) { if (!page_poisoning_enabled_static() && !want_init_on_free()) { nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave * nr_pages behind when capping the number of entries. */ if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; } else { /* * With page poisoning and init_on_free, the time it * takes to free memory grows proportionally with the * actual memory size. Therefore, limit based on the * actual memory size and not the number of involved * folios. */ for (nr = 0, nr_pages = 0; nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; nr++) { if (unlikely(encoded_page_flags(pages[nr]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages += encoded_nr_pages(pages[++nr]); else nr_pages++; } } free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); } } static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap, int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ if (likely(nr_pages == 1)) { batch->encoded_pages[batch->nr++] = encode_page(page, flags); } else { flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; batch->encoded_pages[batch->nr++] = encode_page(page, flags); batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); } /* * Make sure that we can always add another "page" + "nr_pages", * requiring two entries instead of only a single one. */ if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap) { return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, PAGE_SIZE); } bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); } #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE static void __tlb_remove_table_free(struct mmu_table_batch *batch) { int i; for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. * * This is needed by some architectures to implement software pagetable walkers. * * gup_fast() and other software pagetable walkers do a lockless page-table * walk and therefore needs some synchronization with the freeing of the page * directories. The chosen means to accomplish that is by disabling IRQs over * the walk. * * Architectures that use IPIs to flush TLBs will then automagically DTRT, * since we unlink the page, flush TLBs, free the page. Since the disabling of * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * * Not all systems IPI every CPU for this purpose: * * - Some architectures have HW support for cross-CPU synchronisation of TLB * flushes, so there's no IPI at all. * * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate * with the hypervisor to defer flushing on preempted vCPUs. * * Such systems need to delay the freeing by some other means, this is that * means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling * holds off grace periods. * * However, in order to batch these pages we need to allocate storage, this * allocation is deep inside the MM code and can thus easily fail on memory * pressure. To guarantee progress we fall back to single table freeing, see * the implementation of tlb_remove_table_one(). * */ static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_rcu(struct rcu_head *head) { __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); } static void tlb_remove_table_free(struct mmu_table_batch *batch) { call_rcu(&batch->rcu, tlb_remove_table_rcu); } #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { if (tlb_needs_table_invalidate()) { /* * Invalidate page-table caches used by hardware walkers. Then * we still need to RCU-sched wait while freeing the pages * because software walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); } } #ifdef CONFIG_PT_RECLAIM static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) { struct ptdesc *ptdesc; ptdesc = container_of(head, struct ptdesc, pt_rcu_head); __tlb_remove_table(ptdesc); } static inline void __tlb_remove_table_one(void *table) { struct ptdesc *ptdesc; ptdesc = table; call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); } #else static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ static void tlb_remove_table_one(void *table) { __tlb_remove_table_one(table); } static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); tlb_remove_table_free(*batch); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } static inline void tlb_table_init(struct mmu_gather *tlb) { tlb->batch = NULL; } #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_table_flush(struct mmu_gather *tlb) { } static inline void tlb_table_init(struct mmu_gather *tlb) { } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { tlb_table_flush(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #endif tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif tlb->vma_pfn = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } /** * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * In this case, @mm is without users and we're going to destroy the * full address space (exit/execve). * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); } /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * * Called at the end of the shootdown operation to free up any resources that * were required. */ void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and result in having stale TLB entries. So flush TLB forcefully * if we detect parallel PTE batching threads. * * However, some syscalls, e.g. munmap(), may free page tables, this * needs force flush everything in the given range. Otherwise this * may result in having stale TLB entries for some architectures, * e.g. aarch64, that could specify flush what level TLB. */ if (mm_tlb_flush_nested(tlb->mm)) { /* * The aarch64 yields better performance with fullmm by * avoiding multiple CPUs spamming TLBI messages at the * same time. * * On x86 non-fullmm doesn't yield significant difference * against fullmm. */ tlb->fullmm = 1; __tlb_reset_range(tlb); tlb->freed_tables = 1; } tlb_flush_mmu(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); }
15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 /* * llc_c_ev.c - Connection component state transition event qualifiers * * A 'state' consists of a number of possible event matching functions, * the actions associated with each being executed when that event is * matched; a 'state machine' accepts events in a serial fashion from an * event queue. Each event is passed to each successive event matching * function until a match is made (the event matching function returns * success, or '0') or the list of event matching functions is exhausted. * If a match is made, the actions associated with the event are executed * and the state is changed to that event's transition state. Before some * events are recognized, even after a match has been made, a certain * number of 'event qualifier' functions must also be executed. If these * all execute successfully, then the event is finally executed. * * These event functions must return 0 for success, to show a matched * event, of 1 if the event does not match. Event qualifier functions * must return a 0 for success or a non-zero for failure. Each function * is simply responsible for verifying one single thing and returning * either a success or failure. * * All of followed event functions are described in 802.2 LLC Protocol * standard document except two functions that we added that will explain * in their comments, at below. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <net/llc_conn.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/llc_c_ac.h> #include <net/llc_c_ev.h> #include <net/llc_pdu.h> #if 1 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /** * llc_util_ns_inside_rx_window - check if sequence number is in rx window * @ns: sequence number of received pdu. * @vr: sequence number which receiver expects to receive. * @rw: receive window size of receiver. * * Checks if sequence number of received PDU is in range of receive * window. Returns 0 for success, 1 otherwise */ static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw) { return !llc_circular_between(vr, ns, (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO); } /** * llc_util_nr_inside_tx_window - check if sequence number is in tx window * @sk: current connection. * @nr: N(R) of received PDU. * * This routine checks if N(R) of received PDU is in range of transmit * window; on the other hand checks if received PDU acknowledges some * outstanding PDUs that are in transmit window. Returns 0 for success, 1 * otherwise. */ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) { u8 nr1, nr2; struct sk_buff *skb; struct llc_pdu_sn *pdu; struct llc_sock *llc = llc_sk(sk); int rc = 0; if (llc->dev->flags & IFF_LOOPBACK) goto out; rc = 1; if (skb_queue_empty(&llc->pdu_unack_q)) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr1 = LLC_I_GET_NS(pdu); skb = skb_peek_tail(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr2 = LLC_I_GET_NS(pdu); rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO); out: return rc; } int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_CONN_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DATA_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DISC_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_RESET_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1; } int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1; } int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1; } int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1; } int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1; } int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 0 : 1; } int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { if (LLC_I_PF_IS_1(pdu)) rc = 0; } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu)) rc = 0; } return rc; } int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_CMD(pdu)) { case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: rc = 0; break; } } return rc; } int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_RSP(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_RSP(pdu)) { case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: rc = 0; break; } } return rc; } int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_CMD(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); rc = 0; } return rc; } int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_RSP(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { rc = 0; dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); } return rc; } int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb) { return 0; } int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_P_TMR; } int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_ACK_TMR; } int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_REJ_TMR; } int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR; } int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 0 : 1; } /* Event qualifier functions * * these functions simply verify the value of a state flag associated with * the connection and return either a 0 for success or a non-zero value * for not-success; verify the event is the type we expect */ int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 1; } int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag; } int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 2; } int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag != 1; } /** * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines when frame which is sent, is last frame of * transmit window, if it is then this function return zero else return * one. This function is used for sending last frame of transmit window * as I-format command with p-bit set to one. Returns 0 if frame is last * frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb) { return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k); } /** * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines when frame which is sent, isn't last frame of * transmit window, if it isn't then this function return zero else return * one. Returns 0 if frame isn't last frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb) { return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k; } int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag; } int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb) { u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); return llc_sk(sk)->p_flag == f_bit ? 0 : 1; } int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2); } int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2); } int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONN; return 0; } int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_DISC; return 0; } int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_FAILED; return 0; } int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REMOTE_BUSY; return 0; } int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REFUSE; return 0; } int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONFLICT; return 0; } int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_RESET_DONE; return 0; }
7 1 7 7 6 7 7 7 7 6 13 13 26 26 48 13 2 45 45 43 1 45 45 45 42 42 42 42 46 1 45 6 6 2 2 4 13 17 10 1 1 1 2 2 2 289 6 281 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. * * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc **queues; }; static struct Qdisc * multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif band = skb_get_queue_mapping(skb); if (band >= q->bands) return q->queues[0]; return q->queues[band]; } static int multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; qdisc = multiq_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *multiq_dequeue(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ q->curband++; if (q->curband >= q->bands) q->curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } } } return NULL; } static struct sk_buff *multiq_peek(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned int curband = q->curband; struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ curband++; if (curband >= q->bands) curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), curband))) { qdisc = q->queues[curband]; skb = qdisc->ops->peek(qdisc); if (skb) return skb; } } return NULL; } static void multiq_reset(struct Qdisc *sch) { u16 band; struct multiq_sched_data *q = qdisc_priv(sch); for (band = 0; band < q->bands; band++) qdisc_reset(q->queues[band]); q->curband = 0; } static void multiq_destroy(struct Qdisc *sch) { int band; struct multiq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); for (band = 0; band < q->bands; band++) qdisc_put(q->queues[band]); kfree(q->queues); } static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; struct Qdisc **removed; int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); qopt->bands = qdisc_dev(sch)->real_num_tx_queues; removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands), GFP_KERNEL); if (!removed) return -ENOMEM; sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; qdisc_purge_queue(child); removed[n_removed++] = child; } } sch_tree_unlock(sch); for (i = 0; i < n_removed; i++) qdisc_put(removed[i]); kfree(removed); for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (child) { sch_tree_lock(sch); old = q->queues[i]; q->queues[i] = child; if (child != &noop_qdisc) qdisc_hash_add(child, true); if (old != &noop_qdisc) qdisc_purge_queue(old); sch_tree_unlock(sch); qdisc_put(old); } } } return 0; } static int multiq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); int i, err; q->queues = NULL; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->max_bands = qdisc_dev(sch)->num_tx_queues; q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); if (!q->queues) return -ENOBUFS; for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; return multiq_tune(sch, opt, extack); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_multiq_qopt opt; opt.bands = q->bands; opt.max_bands = q->max_bands; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } static struct Qdisc * multiq_leaf(struct Qdisc *sch, unsigned long arg) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long multiq_find(struct Qdisc *sch, u32 classid) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return multiq_find(sch, classid); } static void multiq_unbind(struct Qdisc *q, unsigned long cl) { } static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct multiq_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl - 1]->handle; return 0; } static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct multiq_sched_data *q = qdisc_priv(sch); int band; if (arg->stop) return; for (band = 0; band < q->bands; band++) { if (!tc_qdisc_stats_dump(sch, band + 1, arg)) break; } } static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops multiq_class_ops = { .graft = multiq_graft, .leaf = multiq_leaf, .find = multiq_find, .walk = multiq_walk, .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, .unbind_tcf = multiq_unbind, .dump = multiq_dump_class, .dump_stats = multiq_dump_class_stats, }; static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &multiq_class_ops, .id = "multiq", .priv_size = sizeof(struct multiq_sched_data), .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, .change = multiq_tune, .dump = multiq_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("multiq"); static int __init multiq_module_init(void) { return register_qdisc(&multiq_qdisc_ops); } static void __exit multiq_module_exit(void) { unregister_qdisc(&multiq_qdisc_ops); } module_init(multiq_module_init) module_exit(multiq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Multi queue to hardware queue mapping qdisc");
15 15 20 20 20 14 20 9 9 19 2 1 1 20 21 19 7 2 6 9 18 11 9 18 15 7 8 4 11 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0 /* * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum * * Copyright 2018 Google LLC */ /* * "NHPoly1305" is the main component of Adiantum hashing. * Specifically, it is the calculation * * H_L ← Poly1305_{K_L}(NH_{K_N}(pad_{128}(L))) * * from the procedure in section 6.4 of the Adiantum paper [1]. It is an * ε-almost-∆-universal (ε-∆U) hash function for equal-length inputs over * Z/(2^{128}Z), where the "∆" operation is addition. It hashes 1024-byte * chunks of the input with the NH hash function [2], reducing the input length * by 32x. The resulting NH digests are evaluated as a polynomial in * GF(2^{130}-5), like in the Poly1305 MAC [3]. Note that the polynomial * evaluation by itself would suffice to achieve the ε-∆U property; NH is used * for performance since it's over twice as fast as Poly1305. * * This is *not* a cryptographic hash function; do not use it as such! * * [1] Adiantum: length-preserving encryption for entry-level processors * (https://eprint.iacr.org/2018/720.pdf) * [2] UMAC: Fast and Secure Message Authentication * (https://fastcrypto.org/umac/umac_proc.pdf) * [3] The Poly1305-AES message-authentication code * (https://cr.yp.to/mac/poly1305-20050329.pdf) */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/internal/hash.h> #include <crypto/internal/poly1305.h> #include <crypto/nhpoly1305.h> #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/module.h> static void nh_generic(const u32 *key, const u8 *message, size_t message_len, __le64 hash[NH_NUM_PASSES]) { u64 sums[4] = { 0, 0, 0, 0 }; BUILD_BUG_ON(NH_PAIR_STRIDE != 2); BUILD_BUG_ON(NH_NUM_PASSES != 4); while (message_len) { u32 m0 = get_unaligned_le32(message + 0); u32 m1 = get_unaligned_le32(message + 4); u32 m2 = get_unaligned_le32(message + 8); u32 m3 = get_unaligned_le32(message + 12); sums[0] += (u64)(u32)(m0 + key[ 0]) * (u32)(m2 + key[ 2]); sums[1] += (u64)(u32)(m0 + key[ 4]) * (u32)(m2 + key[ 6]); sums[2] += (u64)(u32)(m0 + key[ 8]) * (u32)(m2 + key[10]); sums[3] += (u64)(u32)(m0 + key[12]) * (u32)(m2 + key[14]); sums[0] += (u64)(u32)(m1 + key[ 1]) * (u32)(m3 + key[ 3]); sums[1] += (u64)(u32)(m1 + key[ 5]) * (u32)(m3 + key[ 7]); sums[2] += (u64)(u32)(m1 + key[ 9]) * (u32)(m3 + key[11]); sums[3] += (u64)(u32)(m1 + key[13]) * (u32)(m3 + key[15]); key += NH_MESSAGE_UNIT / sizeof(key[0]); message += NH_MESSAGE_UNIT; message_len -= NH_MESSAGE_UNIT; } hash[0] = cpu_to_le64(sums[0]); hash[1] = cpu_to_le64(sums[1]); hash[2] = cpu_to_le64(sums[2]); hash[3] = cpu_to_le64(sums[3]); } /* Pass the next NH hash value through Poly1305 */ static void process_nh_hash_value(struct nhpoly1305_state *state, const struct nhpoly1305_key *key) { BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0); poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash, NH_HASH_BYTES / POLY1305_BLOCK_SIZE, 1); } /* * Feed the next portion of the source data, as a whole number of 16-byte * "NH message units", through NH and Poly1305. Each NH hash is taken over * 1024 bytes, except possibly the final one which is taken over a multiple of * 16 bytes up to 1024. Also, in the case where data is passed in misaligned * chunks, we combine partial hashes; the end result is the same either way. */ static void nhpoly1305_units(struct nhpoly1305_state *state, const struct nhpoly1305_key *key, const u8 *src, unsigned int srclen, nh_t nh_fn) { do { unsigned int bytes; if (state->nh_remaining == 0) { /* Starting a new NH message */ bytes = min_t(unsigned int, srclen, NH_MESSAGE_BYTES); nh_fn(key->nh_key, src, bytes, state->nh_hash); state->nh_remaining = NH_MESSAGE_BYTES - bytes; } else { /* Continuing a previous NH message */ __le64 tmp_hash[NH_NUM_PASSES]; unsigned int pos; int i; pos = NH_MESSAGE_BYTES - state->nh_remaining; bytes = min(srclen, state->nh_remaining); nh_fn(&key->nh_key[pos / 4], src, bytes, tmp_hash); for (i = 0; i < NH_NUM_PASSES; i++) le64_add_cpu(&state->nh_hash[i], le64_to_cpu(tmp_hash[i])); state->nh_remaining -= bytes; } if (state->nh_remaining == 0) process_nh_hash_value(state, key); src += bytes; srclen -= bytes; } while (srclen); } int crypto_nhpoly1305_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct nhpoly1305_key *ctx = crypto_shash_ctx(tfm); int i; if (keylen != NHPOLY1305_KEY_SIZE) return -EINVAL; poly1305_core_setkey(&ctx->poly_key, key); key += POLY1305_BLOCK_SIZE; for (i = 0; i < NH_KEY_WORDS; i++) ctx->nh_key[i] = get_unaligned_le32(key + i * sizeof(u32)); return 0; } EXPORT_SYMBOL(crypto_nhpoly1305_setkey); int crypto_nhpoly1305_init(struct shash_desc *desc) { struct nhpoly1305_state *state = shash_desc_ctx(desc); poly1305_core_init(&state->poly_state); state->buflen = 0; state->nh_remaining = 0; return 0; } EXPORT_SYMBOL(crypto_nhpoly1305_init); int crypto_nhpoly1305_update_helper(struct shash_desc *desc, const u8 *src, unsigned int srclen, nh_t nh_fn) { struct nhpoly1305_state *state = shash_desc_ctx(desc); const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm); unsigned int bytes; if (state->buflen) { bytes = min(srclen, (int)NH_MESSAGE_UNIT - state->buflen); memcpy(&state->buffer[state->buflen], src, bytes); state->buflen += bytes; if (state->buflen < NH_MESSAGE_UNIT) return 0; nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT, nh_fn); state->buflen = 0; src += bytes; srclen -= bytes; } if (srclen >= NH_MESSAGE_UNIT) { bytes = round_down(srclen, NH_MESSAGE_UNIT); nhpoly1305_units(state, key, src, bytes, nh_fn); src += bytes; srclen -= bytes; } if (srclen) { memcpy(state->buffer, src, srclen); state->buflen = srclen; } return 0; } EXPORT_SYMBOL(crypto_nhpoly1305_update_helper); int crypto_nhpoly1305_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { return crypto_nhpoly1305_update_helper(desc, src, srclen, nh_generic); } EXPORT_SYMBOL(crypto_nhpoly1305_update); int crypto_nhpoly1305_final_helper(struct shash_desc *desc, u8 *dst, nh_t nh_fn) { struct nhpoly1305_state *state = shash_desc_ctx(desc); const struct nhpoly1305_key *key = crypto_shash_ctx(desc->tfm); if (state->buflen) { memset(&state->buffer[state->buflen], 0, NH_MESSAGE_UNIT - state->buflen); nhpoly1305_units(state, key, state->buffer, NH_MESSAGE_UNIT, nh_fn); } if (state->nh_remaining) process_nh_hash_value(state, key); poly1305_core_emit(&state->poly_state, NULL, dst); return 0; } EXPORT_SYMBOL(crypto_nhpoly1305_final_helper); int crypto_nhpoly1305_final(struct shash_desc *desc, u8 *dst) { return crypto_nhpoly1305_final_helper(desc, dst, nh_generic); } EXPORT_SYMBOL(crypto_nhpoly1305_final); static struct shash_alg nhpoly1305_alg = { .base.cra_name = "nhpoly1305", .base.cra_driver_name = "nhpoly1305-generic", .base.cra_priority = 100, .base.cra_ctxsize = sizeof(struct nhpoly1305_key), .base.cra_module = THIS_MODULE, .digestsize = POLY1305_DIGEST_SIZE, .init = crypto_nhpoly1305_init, .update = crypto_nhpoly1305_update, .final = crypto_nhpoly1305_final, .setkey = crypto_nhpoly1305_setkey, .descsize = sizeof(struct nhpoly1305_state), }; static int __init nhpoly1305_mod_init(void) { return crypto_register_shash(&nhpoly1305_alg); } static void __exit nhpoly1305_mod_exit(void) { crypto_unregister_shash(&nhpoly1305_alg); } module_init(nhpoly1305_mod_init); module_exit(nhpoly1305_mod_exit); MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); MODULE_ALIAS_CRYPTO("nhpoly1305"); MODULE_ALIAS_CRYPTO("nhpoly1305-generic");
6 1 18 18 10 9 20 10 9 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 /* * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _TLS_INT_H #define _TLS_INT_H #include <asm/byteorder.h> #include <linux/types.h> #include <linux/skmsg.h> #include <net/tls.h> #include <net/tls_prot.h> #define TLS_PAGE_ORDER (min_t(unsigned int, PAGE_ALLOC_COSTLY_ORDER, \ TLS_MAX_PAYLOAD_SIZE >> PAGE_SHIFT)) #define __TLS_INC_STATS(net, field) \ __SNMP_INC_STATS((net)->mib.tls_statistics, field) #define TLS_INC_STATS(net, field) \ SNMP_INC_STATS((net)->mib.tls_statistics, field) #define TLS_DEC_STATS(net, field) \ SNMP_DEC_STATS((net)->mib.tls_statistics, field) struct tls_cipher_desc { unsigned int nonce; unsigned int iv; unsigned int key; unsigned int salt; unsigned int tag; unsigned int rec_seq; unsigned int iv_offset; unsigned int key_offset; unsigned int salt_offset; unsigned int rec_seq_offset; char *cipher_name; bool offloadable; size_t crypto_info; }; #define TLS_CIPHER_MIN TLS_CIPHER_AES_GCM_128 #define TLS_CIPHER_MAX TLS_CIPHER_ARIA_GCM_256 extern const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN]; static inline const struct tls_cipher_desc *get_cipher_desc(u16 cipher_type) { if (cipher_type < TLS_CIPHER_MIN || cipher_type > TLS_CIPHER_MAX) return NULL; return &tls_cipher_desc[cipher_type - TLS_CIPHER_MIN]; } static inline char *crypto_info_iv(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->iv_offset; } static inline char *crypto_info_key(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->key_offset; } static inline char *crypto_info_salt(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->salt_offset; } static inline char *crypto_info_rec_seq(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->rec_seq_offset; } /* TLS records are maintained in 'struct tls_rec'. It stores the memory pages * allocated or mapped for each TLS record. After encryption, the records are * stores in a linked list. */ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; struct sk_msg msg_plaintext; struct sk_msg msg_encrypted; /* AAD | msg_plaintext.sg.data | sg_tag */ struct scatterlist sg_aead_in[2]; /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ struct scatterlist sg_aead_out[2]; char content_type; struct scatterlist sg_content_type; struct sock *sk; char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_MAX_IV_SIZE]; /* Must be last --ends in a flexible-array member. */ struct aead_request aead_req; }; int __net_init tls_proc_init(struct net *net); void __net_exit tls_proc_fini(struct net *net); struct tls_context *tls_ctx_create(struct sock *sk); void tls_ctx_free(struct sock *sk, struct tls_context *ctx); void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc); int tls_set_sw_offload(struct sock *sk, int tx, struct tls_crypto_info *new_crypto_info); void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_sw_splice_eof(struct socket *sock); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_release_resources_tx(struct sock *sk); void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t read_actor); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); int tls_tx_records(struct sock *sk, int flags); void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); int tls_process_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); int decrypt_skb(struct sock *sk, struct scatterlist *sgout); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); int tls_strp_dev_init(void); void tls_strp_dev_exit(void); void tls_strp_done(struct tls_strparser *strp); void tls_strp_stop(struct tls_strparser *strp); int tls_strp_init(struct tls_strparser *strp, struct sock *sk); void tls_strp_data_ready(struct tls_strparser *strp); void tls_strp_check_rcv(struct tls_strparser *strp); void tls_strp_msg_done(struct tls_strparser *strp); int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); void tls_rx_msg_ready(struct tls_strparser *strp); bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); static inline struct tls_msg *tls_msg(struct sk_buff *skb) { struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb; return &scb->tls; } static inline struct sk_buff *tls_strp_msg(struct tls_sw_context_rx *ctx) { DEBUG_NET_WARN_ON_ONCE(!ctx->strp.msg_ready || !ctx->strp.anchor->len); return ctx->strp.anchor; } static inline bool tls_strp_msg_ready(struct tls_sw_context_rx *ctx) { return READ_ONCE(ctx->strp.msg_ready); } static inline bool tls_strp_msg_mixed_decrypted(struct tls_sw_context_rx *ctx) { return ctx->strp.mixed_decrypted; } #ifdef CONFIG_TLS_DEVICE int tls_device_init(void); void tls_device_cleanup(void); int tls_set_device_offload(struct sock *sk); void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx); #else static inline int tls_device_init(void) { return 0; } static inline void tls_device_cleanup(void) {} static inline int tls_set_device_offload(struct sock *sk) { return -EOPNOTSUPP; } static inline void tls_device_free_resources_tx(struct sock *sk) {} static inline int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { return -EOPNOTSUPP; } static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} static inline void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} static inline int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { return 0; } #endif int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, int flags); void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); static inline bool tls_is_partially_sent_record(struct tls_context *ctx) { return !!ctx->partially_sent_record; } static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) { return tls_ctx->pending_open_record_frags; } static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; for (i = len - 1; i >= 0; i--) { ++seq[i]; if (seq[i] != 0) break; } return (i == -1); } static inline void tls_bigint_subtract(unsigned char *seq, int n) { u64 rcd_sn; __be64 *p; BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8); p = (__be64 *)seq; rcd_sn = be64_to_cpu(*p); *p = cpu_to_be64(rcd_sn - n); } static inline void tls_advance_record_sn(struct sock *sk, struct tls_prot_info *prot, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) tls_err_abort(sk, -EBADMSG); if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) tls_bigint_increment(ctx->iv + prot->salt_size, prot->iv_size); } static inline void tls_xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq) { int i; if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { for (i = 0; i < 8; i++) iv[i + 4] ^= seq[i]; } } static inline void tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; size_t pkt_len, iv_size = prot->iv_size; pkt_len = plaintext_len + prot->tag_size; if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) { pkt_len += iv_size; memcpy(buf + TLS_NONCE_OFFSET, ctx->tx.iv + prot->salt_size, iv_size); } /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = prot->version == TLS_1_3_VERSION ? TLS_RECORD_TYPE_DATA : record_type; /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */ buf[1] = TLS_1_2_VERSION_MINOR; buf[2] = TLS_1_2_VERSION_MAJOR; /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; } static inline void tls_make_aad(char *buf, size_t size, char *record_sequence, unsigned char record_type, struct tls_prot_info *prot) { if (prot->version != TLS_1_3_VERSION) { memcpy(buf, record_sequence, prot->rec_seq_size); buf += 8; } else { size += prot->tag_size; } buf[0] = prot->version == TLS_1_3_VERSION ? TLS_RECORD_TYPE_DATA : record_type; buf[1] = TLS_1_2_VERSION_MAJOR; buf[2] = TLS_1_2_VERSION_MINOR; buf[3] = size >> 8; buf[4] = size & 0xFF; } #endif
8 803 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(dev, key); neigh_confirm(n); rcu_read_unlock(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
33 219 3 1039 1032 605 540 137 137 539 137 45 164 198 199 200 195 12 169 170 35 36 29 20 11 9 796 796 3 3 3 3 3 11 11 11 391 20 21 21 1 1 1476 1474 14 14 1 1 14 14 1 23 31 2 2 1 2 54 59 59 53 3 54 6 2 46 46 1379 79 80 6 63 3 21 5 62 8 78 79 23 66 80 1463 1370 753 759 794 81 1465 1470 1462 25 1400 171 1410 1 1464 1470 1463 1464 29 1466 13 1 1467 1460 10 1565 864 1470 1460 1463 1453 26 21 1 19 3 2 16 18 4 4 4 6 3 7 627 215 213 104 104 37 1 75 104 217 325 307 771 832 832 833 830 806 42 832 963 951 207 204 31 207 25 25 80 80 80 112 112 112 112 5 93 17 64 1 112 2 109 59 52 70 89 118 42 41 42 29 12 1 13 721 721 727 719 721 715 9 1375 8 8 727 723 722 3 3 3 149 149 1 6 6 6 1468 1462 1463 144 129 9 9 9 9 698 1497 9 9 8 1 9 6 6 6 6 650 651 649 6 6 6 3 6 4 633 4 4 634 1463 1465 1466 20 1465 3 3 3 3 3 55 55 5 4 1 2 34 35 120 110 36 36 36 36 35 36 119 120 1570 1574 1417 200 866 1543 1547 1540 19 1370 33 1375 727 1535 1522 10 1533 707 709 401 135 4 181 79 310 32 400 398 1 697 701 352 690 697 701 975 571 975 277 828 30 730 972 975 977 974 975 27 27 391 381 391 17 392 405 405 405 402 392 405 116 115 115 3 116 47 79 80 115 8 8 8 8 8 8 8 8 8 15 15 3 12 5 10 8 8 8 8 5 3 8 3 3 15 15 5 12 3 3 3 3 3 1 54 54 54 8 5 4 14 14 14 14 12 14 2 4 14 7 13 1 13 13 145 145 145 145 145 779 787 188 189 189 15 14 2 10 6 6 1 3 12 10 4 5 14 4 8 64 29 10 25 10 15 25 24 2 3 43 1 31 682 797 7 6 9 775 5 13 12 78 48 28 3 3 30 30 731 64 693 49 693 9 694 722 752 766 788 122 19 19 19 19 19 19 11 8 8 722 1 2 2 8 2 708 1 765 711 52 5 716 4 727 35 727 43 672 92 760 5 5 2 3 3 106 697 698 3 1 108 5 5 706 13 3 100 629 626 1 624 626 619 4 7 2 5 7 7 4 19 2 19 3 17 1 13 4 4 13 12 10 10 9 2 7 15 14 14 14 1 2 2 9 4 2 1 1 2 1 1 1 1 1 6 6 6 1 2 1 6 6 6 6 44 45 45 37 25 30 45 21 5 21 54 25 21 9 118 118 118 118 52 54 12 106 118 66 52 52 587 586 12 2 12 12 12 2 2 2 2 2 6 6 4 3 1 4 6 1 6 1 5 6 1 6 1 6 6 6 6 696 691 6 6 6 704 705 682 704 578 569 1 19 19 1 1 1 1 1 1 1 1 1 1 1 1154 1153 784 1 523 19 1 1 219 43 219 1154 1038 1037 1039 175 175 55 55 55 175 175 14 175 175 29 1 1 17 7 6 1 27 2 26 120 2 3 2 1 86 26 109 3 104 7 99 3 2 5 7 83 24 1 98 2 5 99 2 2 99 3 79 23 96 6 90 12 96 6 73 29 94 2 73 24 68 29 87 4 4 10 1 9 6 6 5 1 6 6 6 6 6 6 5 1 1 6 3 16 10 3 14 11 3 2 5 1 13 10 6 4 6 6 3 16 10 10 5 4 4 4 3 1 25 11 12 2 2 2 5 7 95 14 5 76 19 62 5 9 1111 5 1108 1106 4 4 4 4 1109 6 6 1150 11 1149 3 1149 1 16 157 1073 16 1131 6 1132 8 1143 1145 1 15 15 4 4 4 4 5 6 13 1125 1122 36 1 1131 1104 509 1130 1 9 1 1145 16 1 4 6 1 1 1 1 16 16 16 2 2 1 32 32 31 22 15 14 5 11 15 4 22 10 15 16 16 16 1 24 30 28 7 10 1 2 17 11 2 14 1 14 14 12 5 55 55 26 25 2 3 15 2 11 14 12 24 2 23 3 4 22 24 2 24 2 22 4 25 11 1 10 10 14 21 16 17 1 16 3 14 1108 1110 1111 1109 1 1 1 23 23 23 23 2779 2712 349 194 194 194 194 194 194 194 194 194 194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * FIB front-end. * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ /* Changes: * * YOSHIFUJI Hideaki @USAGI * reworked default router selection. * - respect outgoing interface * - select from (probably) reachable routers (i.e. * routers in REACHABLE, STALE, DELAY or PROBE states). * - always select the same router if it is (probably) * reachable. otherwise, round-robin the list. * Ville Nuorvala * Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/capability.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/init.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/jhash.h> #include <linux/siphash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/tcp.h> #include <linux/rtnetlink.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/ip.h> #include <linux/uaccess.h> #include <linux/btf_ids.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static int ip6_rt_type_to_error(u8 fib6_type); #define CREATE_TRACE_POINTS #include <trace/events/fib6.h> EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); #undef CREATE_TRACE_POINTS enum rt6_nud_state { RT6_NUD_FAIL_HARD = -3, RT6_NUD_FAIL_PROBE = -2, RT6_NUD_FAIL_DO_RR = -1, RT6_NUD_SUCCEED = 1 }; INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); #endif struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void rt6_uncached_list_flush_dev(struct net_device *dev) { int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt, *safe; if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; bool handled = false; if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; } if (rt_dev == dev) { rt->dst.dev = blackhole_netdev; netdev_ref_replace(rt_dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); handled = true; } if (handled) list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) return &ipv6_hdr(skb)->daddr; return daddr; } struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr) { struct neighbour *n; daddr = choose_neigh_daddr(gw, skb, daddr); n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; n = neigh_create(&nd_tbl, daddr, dev); return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst_dev(dst), skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst_dev(dst); daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) return; if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) return; __ipv6_confirm_neigh(dev, daddr); } static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .default_advmss = ip6_default_advmss, .neigh_lookup = ip6_dst_neigh_lookup, .check = ip6_dst_check, .destroy = ip6_dst_destroy, .cow_metrics = dst_cow_metrics_generic, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; static const struct rt6_info ip6_null_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, .input = ip6_pkt_discard, .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES static const struct rt6_info ip6_prohibit_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, .input = ip6_pkt_prohibit, .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #endif static void rt6_info_init(struct rt6_info *rt) { memset_after(rt, 0, dst); } /* allocate dst with ip6_dst_ops */ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, flags); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); } return rt; } EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; ip_dst_metrics_put(dst); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); if (blackhole_idev) { rt->rt6i_idev = blackhole_idev; in6_dev_put(idev); } } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, READ_ONCE(rt->dst.expires)); return false; } static bool rt6_check_expired(const struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, READ_ONCE(rt->dst.expires))) return true; } else if (from) { return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; } static struct fib6_info * rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; iter = rcu_dereference(fn->leaf); if (!iter) goto out; while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference(iter->fib6_next); } out: return NULL; } void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; if (match->nh && have_oif_match && res->nh) return; if (skb) IP6CB(skb)->flags |= IP6SKB_MULTIPATH; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash && (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (unlikely(match->nh)) { nexthop_path_fib6_result(res, fl6->mp_hash); return; } first = rt6_multipath_first_sibling_rcu(match); if (!first) goto out; hash = fl6->mp_hash; if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, strict) >= 0) match = first; goto out; } list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } out: res->f6i = match; res->nh = match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. */ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, const struct in6_addr *saddr, int oif, int flags) { const struct net_device *dev; if (nh->fib_nh_flags & RTNH_F_DEAD) return false; dev = nh->fib_nh_dev; if (oif) { if (dev->ifindex == oif) return true; } else { if (ipv6_chk_addr(net, saddr, dev, flags & RT6_LOOKUP_F_IFACE)) return true; } return false; } struct fib6_nh_dm_arg { struct net *net; const struct in6_addr *saddr; int oif; int flags; struct fib6_nh *nh; }; static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_dm_arg *arg = _arg; arg->nh = nh; return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, arg->flags); } /* returns fib6_nh from nexthop or NULL */ static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_nh_dm_arg arg = { .net = net, .saddr = saddr, .oif = oif, .flags = flags, }; if (nexthop_is_blackhole(nh)) return NULL; if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) return arg.nh; return NULL; } static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_info *f6i = res->f6i; struct fib6_info *spf6i; struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { bool matched = false; if (unlikely(spf6i->nh)) { nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, oif, flags); if (nh) matched = true; } else { nh = spf6i->fib6_nh; if (__rt6_device_match(net, nh, saddr, oif, flags)) matched = true; } if (matched) { res->f6i = spf6i; goto out; } } if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; goto out; } if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; return; out_blackhole: res->fib6_flags |= RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF struct __rt6_probe_work { struct work_struct work; struct in6_addr target; struct net_device *dev; netdevice_tracker dev_tracker; }; static void rt6_probe_deferred(struct work_struct *w) { struct in6_addr mcaddr; struct __rt6_probe_work *work = container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); netdev_put(work->dev, &work->dev_tracker); kfree(work); } static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; unsigned long last_probe; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it * is really so; aka Router Reachability Probing. * * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ if (!fib6_nh->fib_nh_gw_family) return; nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); if (!idev) goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) goto out; write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (!work || cmpxchg(&fib6_nh->last_probe, last_probe, jiffies) != last_probe) { kfree(work); } else { INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); work->dev = dev; schedule_work(&work->work); } out: rcu_read_unlock(); } #else static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif /* * Default Router Selection (RFC 2461 6.3.6) */ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; rcu_read_lock(); neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, &fib6_nh->fib_nh_gw6); if (neigh) { u8 nud_state = READ_ONCE(neigh->nud_state); if (nud_state & NUD_VALID) ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; else ret = RT6_NUD_FAIL_PROBE; #endif } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } rcu_read_unlock(); return ret; } static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict) { int m = 0; if (!oif || nh->fib_nh_dev->ifindex == oif) m = 2; if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif if ((strict & RT6_LOOKUP_F_REACHABLE) && !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } static bool find_match(struct fib6_nh *nh, u32 fib6_flags, int oif, int strict, int *mpri, bool *do_rr) { bool match_do_rr = false; bool rc = false; int m; if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; if (ip6_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ } else if (m == RT6_NUD_FAIL_HARD) { goto out; } if (strict & RT6_LOOKUP_F_REACHABLE) rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; rc = true; } out: return rc; } struct fib6_nh_frl_arg { u32 flags; int oif; int strict; int *mpri; bool *do_rr; struct fib6_nh *nh; }; static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_frl_arg *arg = _arg; arg->nh = nh; return find_match(nh, arg->flags, arg->oif, arg->strict, arg->mpri, arg->do_rr); } static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, int oif, int strict, bool *do_rr, int *mpri) { struct fib6_info *f6i; for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { *cont = f6i; return; } if (fib6_check_expired(f6i)) continue; if (unlikely(f6i->nh)) { struct fib6_nh_frl_arg arg = { .flags = f6i->fib6_flags, .oif = oif, .strict = strict, .mpri = mpri, .do_rr = do_rr }; if (nexthop_is_blackhole(f6i->nh)) { res->fib6_flags = RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->f6i = f6i; res->nh = nexthop_fib6_nh(f6i->nh); return; } if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, &arg)) { matched = true; nh = arg.nh; } } else { nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) matched = true; } if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; res->fib6_type = f6i->fib6_type; } } } static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, struct fib6_info *rr_head, int oif, int strict, bool *do_rr, struct fib6_result *res) { u32 metric = rr_head->fib6_metric; struct fib6_info *cont = NULL; int mpri = -1; __find_rr_leaf(rr_head, NULL, metric, res, &cont, oif, strict, do_rr, &mpri); __find_rr_leaf(leaf, rr_head, metric, res, &cont, oif, strict, do_rr, &mpri); if (res->f6i || !cont) return; __find_rr_leaf(cont, NULL, metric, res, NULL, oif, strict, do_rr, &mpri); } static void rt6_select(struct net *net, struct fib6_node *fn, int oif, struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct fib6_info *rt0; bool do_rr = false; int key_plen; /* make sure this function or its helpers sets f6i */ res->f6i = NULL; if (!leaf || leaf == net->ipv6.fib6_null_entry) goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) */ key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES if (rt0->fib6_src.plen) key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) goto out; find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } } static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { return (res->f6i->fib6_flags & RTF_NONEXTHOP) || res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) { struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; } /* Sanity check for prefix_len and length */ if (rinfo->length > 3) { return -EINVAL; } else if (rinfo->prefix_len > 128) { return -EINVAL; } else if (rinfo->prefix_len > 64) { if (rinfo->length < 2) { return -EINVAL; } } else if (rinfo->prefix_len > 0) { if (rinfo->length < 1) { return -EINVAL; } } pref = rinfo->route_pref; if (pref == ICMPV6_ROUTER_PREF_INVALID) return -EINVAL; lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; else { /* this function is safe */ ipv6_addr_prefix(&prefix_buf, (struct in6_addr *)rinfo->prefix, rinfo->prefix_len); prefix = &prefix_buf; } if (rinfo->prefix_len == 0) rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { ip6_del_rt(net, rt, false); rt = NULL; } if (!rt && lifetime) rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) rt->fib6_flags = RTF_ROUTEINFO | (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); fib6_remove_gc_list(rt); } else { fib6_set_expires(rt, jiffies + HZ * lifetime); fib6_add_gc_list(rt); } spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } return 0; } #endif /* * Misc support functions */ /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which * case we want dev returned to be dev */ } return dev; } static const int fib6_prop[RTN_MAX + 1] = { [RTN_UNSPEC] = 0, [RTN_UNICAST] = 0, [RTN_LOCAL] = 0, [RTN_BROADCAST] = 0, [RTN_ANYCAST] = 0, [RTN_MULTICAST] = 0, [RTN_BLACKHOLE] = -EINVAL, [RTN_UNREACHABLE] = -EHOSTUNREACH, [RTN_PROHIBIT] = -EACCES, [RTN_THROW] = -EAGAIN, [RTN_NAT] = -EINVAL, [RTN_XRESOLVE] = -EINVAL, }; static int ip6_rt_type_to_error(u8 fib6_type) { return fib6_prop[fib6_type]; } static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; if (rt->dst_nocount) flags |= DST_NOCOUNT; if (rt->dst_nopolicy) flags |= DST_NOPOLICY; return flags; } static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { rt->dst.error = ip6_rt_type_to_error(fib6_type); switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: rt->dst.output = ip6_pkt_prohibit_out; rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: case RTN_UNREACHABLE: default: rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; } } static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; if (res->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; rt->dst.output = ip6_mr_output; } else { rt->dst.input = ip6_forward; } if (res->nh->fib_nh_lws) { rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } rt->dst.lastuse = jiffies; } /* Caller must already hold reference to @from */ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } /* Caller must already hold reference to f6i in result */ static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; const struct net_device *dev = nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; ip6_rt_init_dst(rt, res); rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_flags = res->fib6_flags; if (nh->fib_nh_gw_family) { rt->rt6i_gateway = nh->fib_nh_gw6; rt->rt6i_flags |= RTF_GATEWAY; } rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = f6i->fib6_src; #endif } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; pn = rcu_dereference(fn->parent); sn = FIB6_SUBTREE(pn); if (sn && sn != fn) fn = fib6_node_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) return fn; } } static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) { struct rt6_info *rt = *prt; if (dst_hold_safe(&rt->dst)) return true; if (net) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } else { rt = NULL; } *prt = rt; return false; } /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; unsigned short flags; struct rt6_info *nrt; if (!fib6_info_hold_safe(f6i)) goto fallback; flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { fib6_info_release(f6i); goto fallback; } ip6_rt_copy_init(nrt, res); return nrt; fallback: nrt = dev_net(dev)->ipv6.ip6_null_entry; dst_hold(&nrt->dst); return nrt; } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: res.f6i = rcu_dereference(fn->leaf); if (!res.f6i) res.f6i = net->ipv6.fib6_null_entry; else rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, flags); if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; } else if (res.fib6_flags & RTF_REJECT) { goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { do_create: rt = ip6_create_rt_rcu(&res); } out: trace_fib6_table_lookup(net, &res, table, fl6); rcu_read_unlock(); return rt; } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, .daddr = *daddr, }; struct dst_entry *dst; int flags = strict ? RT6_LOOKUP_F_IFACE : 0; if (saddr) { memcpy(&fl6.saddr, saddr, sizeof(*saddr)); flags |= RT6_LOOKUP_F_HAS_SADDR; } dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return dst_rt6_info(dst); dst_release(dst); return NULL; } EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. * It takes new route entry, the addition fails by any reason the * route is released. * Caller must hold dst before calling it. */ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; struct fib6_table *table; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; } int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; /* * Clone the route. */ if (!fib6_info_hold_safe(f6i)) return NULL; dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(res)) { if (f6i->fib6_dst.plen != 128 && ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif } return rt; } static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; if (f6i->nh) pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); return pcpu_rt; } static bool rt6_is_valid(const struct rt6_info *rt6) { return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); } /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt; pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); dst_release(&prev->dst); } pcpu_rt = NULL; } return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct net *net, const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) return NULL; p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); if (res->f6i->fib6_destroying) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } return pcpu_rt; } /* exception hash table implementation */ static DEFINE_SPINLOCK(rt6_exception_lock); /* Remove rt6_ex from hash table and free the memory * Caller must hold rt6_exception_lock */ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { struct net *net; if (!bucket || !rt6_ex) return; net = dev_net(rt6_ex->rt6i->dst.dev); net->ipv6.rt6_stats->fib_rt_cache--; /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; } /* Remove oldest rt6_ex in bucket and free the memory * Caller must hold rt6_exception_lock */ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) { struct rt6_exception *rt6_ex, *oldest = NULL; if (!bucket) return; hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) oldest = rt6_ex; } rt6_remove_exception(bucket, oldest); } static u32 rt6_exception_hash(const struct in6_addr *dst, const struct in6_addr *src) { static siphash_aligned_key_t rt6_exception_key; struct { struct in6_addr dst; struct in6_addr src; } __aligned(SIPHASH_ALIGNMENT) combined = { .dst = *dst, }; u64 val; net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); #ifdef CONFIG_IPV6_SUBTREES if (src) combined.src = *src; #endif val = siphash(&combined, sizeof(combined), &rt6_exception_key); return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rt6_exception_lock */ static struct rt6_exception * __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rcu_read_lock() */ static struct rt6_exception * __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; WARN_ON_ONCE(!rcu_read_lock_held()); if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } static unsigned int fib6_mtu(const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; unsigned int mtu; if (res->f6i->fib6_pmtu) { mtu = res->f6i->fib6_pmtu; } else { struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL /* used when the flushed bit is not relevant, only access to the bucket * (ie., all bucket users except rt6_insert_exception); * * called under rcu lock; sometimes called with rt6_exception_lock held */ static struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; if (lock) bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); else bucket = rcu_dereference(nh->rt6i_exception_bucket); /* remove bucket flushed bit if set */ if (bucket) { unsigned long p = (unsigned long)bucket; p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; } return bucket; } static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) { unsigned long p = (unsigned long)bucket; return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); } /* called with rt6_exception_lock held */ static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; unsigned long p; bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); p = (unsigned long)bucket; p |= FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); if (!bucket) { err = -ENOMEM; goto out; } rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } else if (fib6_nh_excptn_bucket_flushed(bucket)) { err = -EINVAL; goto out; } #ifdef CONFIG_IPV6_SUBTREES /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by * a hash of only fib6_dst. */ if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu * is less than f6i's mtu value. */ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_remove_exception(bucket, rt6_ex); rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); if (!rt6_ex) { err = -ENOMEM; goto out; } rt6_ex->rt6i = nrt; rt6_ex->stamp = jiffies; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; /* Randomize max depth to avoid some side channels attacks. */ max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } return err; } static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; /* Prevent rt6_insert_exception() to recreate the bucket list */ if (!from) fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { if (!from || rcu_access_pointer(rt6_ex->rt6i->from) == from) rt6_remove_exception(bucket, rt6_ex); } WARN_ON_ONCE(!from && bucket->depth); bucket++; } out: spin_unlock_bh(&rt6_exception_lock); } static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_info *f6i = arg; fib6_nh_flush_exceptions(nh, f6i); return 0; } void rt6_flush_exceptions(struct fib6_info *f6i) { if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); rcu_read_unlock(); } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); } } /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct rt6_info *ret = NULL; #ifdef CONFIG_IPV6_SUBTREES /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * However, the src addr used to create the hash * might not be exactly the passed in saddr which * is a /128 addr from the flow. * So we need to use f6i->fib6_src to redo lookup * if the passed in saddr does not find anything. * (See the logic in ip6_rt_cache_alloc() on how * rt->rt6i_src is updated.) */ if (res->f6i->fib6_src.plen) src_key = saddr; find_ex: #endif bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) ret = rt6_ex->rt6i; #ifdef CONFIG_IPV6_SUBTREES /* Use fib6_src as src_key and redo lookup */ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) { src_key = &res->f6i->fib6_src.addr; goto find_ex; } #endif return ret; } /* Remove the passed in cached rt from the hash table that contains it */ static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int err; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) { rt6_remove_exception(bucket, rt6_ex); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&rt6_exception_lock); return err; } struct fib6_nh_excptn_arg { struct rt6_info *rt; int plen; }; static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_excptn_arg *arg = _arg; int err; err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); if (err == 0) return 1; return 0; } static int rt6_remove_exception_rt(struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; if (from->nh) { struct fib6_nh_excptn_arg arg = { .rt = rt, .plen = from->fib6_src.plen }; int rc; /* rc = 1 means an entry was found */ rc = nexthop_for_each_fib6_nh(from->nh, rt6_nh_remove_exception_rt, &arg); return rc ? 0 : -ENOENT; } return fib6_nh_remove_exception(from->fib6_nh, from->fib6_src.plen, rt); } /* Find rt6_ex which contains the passed in rt cache and * refresh its stamp */ static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; } struct fib6_nh_match_arg { const struct net_device *dev; const struct in6_addr *gw; struct fib6_nh *match; }; /* determine if fib6_nh has given device and gateway */ static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_match_arg *arg = _arg; if (arg->dev != nh->fib_nh_dev || (arg->gw && !nh->fib_nh_gw_family) || (!arg->gw && nh->fib_nh_gw_family) || (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) return 0; arg->match = nh; /* found a match, break the loop */ return 1; } static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct fib6_info *from; struct fib6_nh *fib6_nh; rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) goto unlock; if (from->nh) { struct fib6_nh_match_arg arg = { .dev = rt->dst.dev, .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); if (!arg.match) goto unlock; fib6_nh = arg.match; } else { fib6_nh = from->fib6_nh; } fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. * * If the new MTU is higher, and the route PMTU is equal to the local * MTU, this means the old MTU is the lowest in the path, so allow * updating it: if other nodes now have lower MTUs, PMTU discovery will * handle this. */ if (dst_mtu(&rt->dst) >= mtu) return true; if (dst_mtu(&rt->dst) == idev->cnf.mtu6) return true; return false; } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected * route), the metrics of its rt->from have already * been updated. */ if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } } #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY && ipv6_addr_equal(gateway, &entry->rt6i_gateway)) { rt6_remove_exception(bucket, rt6_ex); } } bucket++; } } spin_unlock_bh(&rt6_exception_lock); } static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_info *rt = rt6_ex->rt6i; /* we are pruning and obsoleting aged-out and non gateway exceptions * even if others have still references to them, so that on next * dst_check() such references can be dropped. * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) + gc_args->timeout)) { pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) { pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { pr_debug("purging route %p via non-router but gateway\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } gc_args->more++; } static void fib6_nh_age_exceptions(const struct fib6_nh *nh, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { rt6_age_examine_exception(bucket, rt6_ex, gc_args, now); } bucket++; } } spin_unlock(&rt6_exception_lock); rcu_read_unlock_bh(); } struct fib6_nh_age_excptn_arg { struct fib6_gc_args *gc_args; unsigned long now; }; static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) { struct fib6_nh_age_excptn_arg *arg = _arg; fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); return 0; } void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now) { if (f6i->nh) { struct fib6_nh_age_excptn_arg arg = { .gc_args = gc_args, .now = now }; nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, &arg); } else { fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); } } /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; redo_rt6_select: rt6_select(net, fn, oif, res, strict); if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; else if (strict & RT6_LOOKUP_F_REACHABLE) { /* also consider unreachable route */ strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; } } trace_fib6_table_lookup(net, res, table, fl6); return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct rt6_info *rt = NULL; int strict = 0; WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held()); strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); if (res.f6i == net->ipv6.fib6_null_entry) goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); if (rt) { /* 1 refcnt is taken during ip6_rt_cache_alloc(). * As rt6_uncached_list_add() does not consume refcnt, * this refcnt is always returned to the caller even * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ rt6_uncached_list_add(rt); rcu_read_unlock(); return rt; } } else { /* Get a percpu copy */ local_bh_disable(); rt = rt6_get_pcpu_route(&res); if (!rt) rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); } out: if (!rt) rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) ip6_hold_safe(net, &rt); rcu_read_unlock(); return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *keys, struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; struct icmp6hdr _icmph; if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) goto out; icmph = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(*icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; _flkeys = NULL; out: if (_flkeys) { keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; keys->tags.flow_label = _flkeys->tags.flow_label; keys->basic.ip_proto = _flkeys->basic.ip_proto; } else { keys->addrs.v6addrs.src = key_iph->saddr; keys->addrs.v6addrs.dst = key_iph->daddr; keys->tags.flow_label = ip6_flowlabel(key_iph); keys->basic.ip_proto = key_iph->nexthdr; } } static u32 rt6_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 rt6_multipath_custom_hash_fl6(const struct net *net, const struct flowi6 *fl6) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = fl6->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = fl6->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl6->fl6_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } else { hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { struct flow_keys keys; if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, 0); flkeys = &keys; } /* Inner can be v4 or v6 */ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.tags.flow_label = flkeys->tags.flow_label; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = rt6_multipath_custom_hash_skb(net, skb); else mhash = rt6_multipath_custom_hash_fl6(net, fl6); break; } return mhash >> 1; } /* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } static struct dst_entry *ip6_route_output_flags_noref(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { bool any_src; if (ipv6_addr_type(&fl6->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; } fl6->flowi6_iif = LOOPBACK_IFINDEX; flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { struct dst_entry *dst; struct rt6_info *rt6; rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); } rcu_read_unlock(); return dst; } EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif } dst_release(dst_orig); return new ? new : ERR_PTR(-ENOMEM); } /* * Destination cache support functions */ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) return false; if (fib6_check_expired(f6i)) return false; return true; } static struct dst_entry *rt6_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { u32 rt_cookie = 0; if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) return NULL; return &rt->dst; } static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { if (!__rt6_check_expired(rt) && READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; struct fib6_info *from; struct rt6_info *rt; rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ from = rcu_dereference(rt->from); if (from && (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->dst.rt_uncached)))) dst_ret = rt6_dst_from_check(rt, from, cookie); else dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); return dst_ret; } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_CACHE) { rcu_read_lock(); if (rt6_check_expired(rt)) { /* rt/dst can not be destroyed yet, * because of rcu_read_lock() */ sk_dst_reset(sk); rt6_remove_exception_rt(rt); } rcu_read_unlock(); return; } sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) { struct rt6_info *rt; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; from = rcu_dereference(rt->from); if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); } } static void rt6_update_expires(struct rt6_info *rt0, int timeout) { if (!(rt0->rt6i_flags & RTF_EXPIRES)) { struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) WRITE_ONCE(rt0->dst.expires, from->expires); rcu_read_unlock(); } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct ipv6hdr *iph, u32 mtu, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. * [see also comment in rt6_mtu_change_route()] */ if (iph) { daddr = &iph->daddr; saddr = &iph->saddr; } else if (sk) { daddr = &sk->sk_v6_daddr; saddr = &inet6_sk(sk)->saddr; } else { daddr = NULL; saddr = NULL; } if (confirm_neigh) dst_confirm_neigh(dst, daddr); if (mtu < IPV6_MIN_MTU) return; if (mtu >= dst_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); /* update rt6_ex->stamp for cache */ if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); res.f6i = rcu_dereference(rt6->from); if (!res.f6i) goto out_unlock; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev + gw. Should be impossible. */ if (!arg.match) goto out_unlock; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } out_unlock: rcu_read_unlock(); } } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_oif = oif, .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { int oif = sk->sk_bound_dev_if; struct dst_entry *dst; if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), sk_uid(sk)); dst = __sk_dst_get(sk); if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; bh_lock_sock(sk); if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ip6_datagram_dst_update(sk, false); bh_unlock_sock(sk); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6) { #ifdef CONFIG_IPV6_SUBTREES struct ipv6_pinfo *np = inet6_sk(sk); #endif ip6_dst_store(sk, dst, ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : #endif false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, struct flowi6 *fl6, const struct in6_addr *gw, struct rt6_info **ret) { const struct fib6_nh *nh = res->nh; if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || fl6->flowi6_oif != nh->fib_nh_dev->ifindex) return false; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. */ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); if (rt_cache && ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { *ret = rt_cache; return true; } return false; } return true; } struct fib6_nh_rd_arg { struct fib6_result *res; struct flowi6 *fl6; const struct in6_addr *gw; struct rt6_info **ret; }; static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_rd_arg *arg = _arg; arg->res->nh = nh; return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); } /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; struct in6_addr gateway; }; INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; struct fib6_nh_rd_arg arg = { .res = &res, .fl6 = fl6, .gw = &rdfl->gateway, .ret = &ret }; struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. * Due to the way the routes are chosen, this notion * is a bit fuzzy and one might need to check all possible * routes. */ rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; if (unlikely(rt->nh)) { if (nexthop_is_blackhole(rt->nh)) continue; /* on match, res->nh is filled in and potentially ret */ if (nexthop_for_each_fib6_nh(rt->nh, fib6_nh_redirect_match, &arg)) goto out; } else { res.nh = rt->fib6_nh; if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) goto out; } } if (!rt) rt = net->ipv6.fib6_null_entry; else if (rt->fib6_flags & RTF_REJECT) { ret = net->ipv6.ip6_null_entry; goto out; } if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } res.f6i = rt; res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); } else { res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; ret = ip6_create_rt_rcu(&res); } rcu_read_unlock(); trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; static struct dst_entry *ip6_route_redirect(struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; rdfl.fl6 = *fl6; rdfl.gateway = *gateway; return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .flowi6_mark = mark, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .daddr = msg->dest, .saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { unsigned int mtu = dst_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); rcu_read_lock(); net = dst_dev_net_rcu(dst); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; rcu_read_unlock(); /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. * IPV6_MAXPLEN is also valid and means: "any MSS, * rely only on pmtu discovery" */ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) mtu = IPV6_MAXPLEN; return mtu; } INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device * * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct fib6_nh *nh = res->nh; struct fib6_info *f6i = res->f6i; struct inet6_dev *idev; struct rt6_info *rt; u32 mtu = 0; if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { mtu = f6i->fib6_pmtu; if (mtu) goto out; } rt = rt6_find_cached_rt(res, daddr, saddr); if (unlikely(rt)) { mtu = dst_metric_raw(&rt->dst, RTAX_MTU); } else { struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); if (idev) mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: return dst; } static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, u32 tbid, int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; int err; table = fib6_get_table(net, tbid); if (!table) return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) fib6_select_path(net, res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); return err; } static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; struct fib6_result res = {}; int err; err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && /* ignore match if it is the default route */ !ipv6_addr_any(&res.f6i->fib6_dst.addr) && (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); err = -EINVAL; } return err; } static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; int flags = RT6_LOOKUP_F_IFACE; struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { err = ip6_nh_lookup_table(net, cfg, gw_addr, cfg->fc_table, flags, &res); /* gw_addr can not require a gateway or resolve to a reject * route. If a device is given, it must match the result. */ if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family || (dev && dev != res.nh->fib_nh_dev)) err = -EHOSTUNREACH; } if (err < 0) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, }; err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family) err = -EHOSTUNREACH; if (err) return err; fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); } err = 0; if (dev) { if (dev != res.nh->fib_nh_dev) err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } return err; } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; int gwa_type = ipv6_addr_type(gw_addr); bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; const struct net_device *dev = *_dev; bool need_addr_check = !dev; int err = -EINVAL; /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() * will return already-added prefix route via interface that * prefix route was assigned to, which might be non-loopback. */ if (dev && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { /* IPv6 strictly inhibits using not link-local * addresses as nexthop address. * Otherwise, router will not able to send redirects. * It is very good, but in some (rare!) circumstances * (SIT, PtP, NBMA NOARP links) it is handy to allow * some exceptions. --ANK * We allow IPv4-mapped nexthops to support RFC4798-type * addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; } rcu_read_lock(); if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, idev); rcu_read_unlock(); if (err) goto out; } /* reload in case device was changed */ dev = *_dev; err = -EINVAL; if (!dev) { NL_SET_ERR_MSG(extack, "Egress device not specified"); goto out; } else if (dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Egress device can not be loopback device for this route"); goto out; } /* if we did not check gw_addr above, do so now that the * egress device has been resolved. */ if (need_addr_check && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } err = 0; out: return err; } static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) { if ((flags & RTF_REJECT) || (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK) && !(flags & (RTF_ANYCAST | RTF_LOCAL)))) return true; return false; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif if (cfg->fc_is_fdb) { fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; return 0; } err = -ENODEV; if (cfg->fc_ifindex) { dev = netdev_get_by_index(net, cfg->fc_ifindex, dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); if (!idev) goto out; } if (cfg->fc_flags & RTNH_F_ONLINK) { if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); goto out; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; } fib6_nh->fib_nh_weight = 1; /* We cannot add true routes via loopback here, * they would result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; goto out; } } goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { err = ip6_validate_gw(net, cfg, &dev, dev_tracker, &idev, extack); if (err) goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; if (!dev) goto out; if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; } if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; pcpu_alloc: fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { err = -ENOMEM; goto out; } fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: if (idev) in6_dev_put(idev); if (err) { fib_nh_common_release(&fib6_nh->nh_common); fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } return err; } void fib6_nh_release(struct fib6_nh *fib6_nh) { struct rt6_exception_bucket *bucket; rcu_read_lock(); fib6_nh_flush_exceptions(fib6_nh, NULL); bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); if (bucket) { rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); kfree(bucket); } rcu_read_unlock(); fib6_nh_release_dsts(fib6_nh); free_percpu(fib6_nh->rt6i_pcpu); fib_nh_common_release(&fib6_nh->nh_common); } void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) { int cpu; if (!fib6_nh->rt6i_pcpu) return; for_each_possible_cpu(cpu) { struct rt6_info *pcpu_rt, **ppcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = xchg(ppcpu_rt, NULL); if (pcpu_rt) { dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); } } } static int fib6_config_validate(struct fib6_config *cfg, struct netlink_ext_ack *extack) { /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto errout; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); goto errout; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); goto errout; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto errout; } #ifdef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); goto errout; } if (cfg->fc_nh_id && cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); goto errout; } #else if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); goto errout; } #endif return 0; errout: return -EINVAL; } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_table *table; struct fib6_info *rt; int err; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); if (!table) { pr_warn("NLM_F_CREATE should be specified when creating new route\n"); table = fib6_new_table(net, cfg->fc_table); } } else { table = fib6_new_table(net, cfg->fc_table); } if (!table) { err = -ENOBUFS; goto err; } rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); if (!rt) { err = -ENOMEM; goto err; } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); goto free; } if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif return rt; free: kfree(rt); err: return ERR_PTR(err); } static int ip6_route_info_create_nh(struct fib6_info *rt, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_nh *fib6_nh; int err; if (cfg->fc_nh_id) { struct nexthop *nh; rcu_read_lock(); nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { err = -EINVAL; NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } err = fib6_check_nexthop(nh, cfg, extack); if (err) goto out_free; if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; goto out_free; } rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); rcu_read_unlock(); } else { int addr_type; err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out_release; fib6_nh = rt->fib6_nh; /* We cannot add true routes via loopback here, they would * result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; } return 0; out_release: fib6_info_release(rt); return err; out_free: rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_info *rt; int err; err = fib6_config_validate(cfg, extack); if (err) return err; rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); if (err) return err; err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); return err; } static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; int err; if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); out: fib6_info_release(rt); return err; } int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { struct nl_info info = { .nl_net = net, .skip_notify = skip_notify }; return __ip6_del_rt(rt, &info); } static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; int err = -ENOENT; if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); skb = NULL; } else info->skip_notify = 1; } /* 'rt' points to the first sibling route. If it is not the * leaf, then we do not need to send a notification. Otherwise, * we need to check if the last sibling has a next route or not * and emit a replace or delete notification, respectively. */ info->skip_notify_kernel = 1; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (rcu_access_pointer(fn->leaf) == rt) { struct fib6_info *last_sibling, *replace_rt; last_sibling = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); replace_rt = rcu_dereference_protected( last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock)); if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); else call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL); } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; } } err = fib6_del(rt, info); out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; } static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) goto out; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; rc = rt6_remove_exception_rt(rt); out: return rc; } static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, struct fib6_nh *nh) { struct fib6_result res = { .f6i = rt, .nh = nh, }; struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); if (rt_cache) return __ip6_del_cached_rt(rt_cache, cfg); return 0; } struct fib6_nh_del_cached_rt_arg { struct fib6_config *cfg; struct fib6_info *f6i; }; static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_del_cached_rt_arg *arg = _arg; int rc; rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); return rc != -ESRCH ? rc : 0; } static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) { struct fib6_nh_del_cached_rt_arg arg = { .cfg = cfg, .f6i = f6i }; return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); } static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); if (!table) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); return err; } rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, !(cfg->fc_flags & RTF_CACHE)); if (fn) { for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; if (rt->nh && cfg->fc_nh_id && rt->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_flags & RTF_CACHE) { int rc = 0; if (rt->nh) { rc = ip6_del_cached_rt_nh(cfg, rt); } else if (cfg->fc_nh_id) { continue; } else { nh = rt->fib6_nh; rc = ip6_del_cached_rt(cfg, rt, nh); } if (rc != -ESRCH) { rcu_read_unlock(); return rc; } continue; } if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; err = __ip6_del_rt(rt, &cfg->fc_nlinfo); break; } if (cfg->fc_nh_id) continue; nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; if (!fib6_info_hold_safe(rt)) continue; /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) err = __ip6_del_rt(rt, &cfg->fc_nlinfo); else err = __ip6_del_rt_siblings(rt, cfg); break; } } rcu_read_unlock(); return err; } static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; optlen = skb_tail_pointer(skb) - skb_transport_header(skb); optlen -= sizeof(*msg); if (optlen < 0) { net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); return; } msg = (struct rd_msg *)icmp6_hdr(skb); if (ipv6_addr_is_multicast(&msg->dest)) { net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); return; } on_link = 0; if (ipv6_addr_equal(&msg->dest, &msg->target)) { on_link = 1; } else if (ipv6_addr_type(&msg->target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); return; } in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) return; if (READ_ONCE(in6_dev->cnf.forwarding) || !READ_ONCE(in6_dev->cnf.accept_redirects)) return; /* RFC2461 8.1: * The IP source address of the Redirect MUST be the same as the current * first-hop router for the specified ICMP Destination Address. */ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } lladdr = NULL; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, skb->dev); if (!lladdr) { net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); return; } } rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; } /* Redirect received -> path was valid. * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) return; /* * We have finally decided to accept it. */ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); rcu_read_lock(); res.f6i = rcu_dereference(rt->from); if (!res.f6i) goto out; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev. Should be impossible */ if (!arg.match) goto out; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; /* rt6_insert_exception() will take care of duplicated exceptions */ if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } netevent.old = &rt->dst; netevent.new = &nrt->dst; netevent.daddr = &msg->dest; netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: rcu_read_unlock(); neigh_release(neigh); } #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; for_each_fib6_node_rt_rcu(fn) { /* these routes do not use nexthops */ if (rt->nh) continue; if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || !rt->fib6_nh->fib_nh_gw_family) continue; if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; break; } out: rcu_read_unlock(); return rt; } static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { struct fib6_config cfg = { .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, }; cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } #endif struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) continue; nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) break; } if (rt && !fib6_info_hold_safe(rt)) rt = NULL; rcu_read_unlock(); return rt; } struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, u32 defrtr_usr_metric, int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); if (table) table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } return rt6_get_dflt_router(net, gwaddr, dev); } static void __rt6_purge_dflt_routers(struct net *net, struct fib6_table *table) { struct fib6_info *rt; restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct net_device *dev = fib6_info_nh_dev(rt); struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); ip6_del_rt(net, rt, false); goto restart; } } rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } void rt6_purge_dflt_routers(struct net *net) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) __rt6_purge_dflt_routers(net, table); } } rcu_read_unlock(); } static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { *cfg = (struct fib6_config){ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, .fc_flags = rtmsg->rtmsg_flags, .fc_type = rtmsg->rtmsg_type, .fc_nlinfo.nl_net = net, .fc_dst = rtmsg->rtmsg_dst, .fc_src = rtmsg->rtmsg_src, .fc_gateway = rtmsg->rtmsg_gateway, }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; int err; if (cmd != SIOCADDRT && cmd != SIOCDELRT) return -EINVAL; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtmsg_to_fib6_config(net, rtmsg, &cfg); switch (cmd) { case SIOCADDRT: /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } return err; } /* * Drop the packet on the floor */ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { SKB_DR_SET(reason, IP_INADDRERRORS); IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } SKB_DR_SET(reason, IP_INNOROUTES); fallthrough; case IPSTATS_MIB_OUTNOROUTES: SKB_DR_OR(reason, IP_OUTNOROUTES); IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } /* Start over by dropping the dst for l3mdev case */ if (netif_is_l3_master(skb->dev)) skb_dst_drop(skb); icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb_reason(skb, reason); return 0; } static int ip6_pkt_discard(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } static int ip6_pkt_prohibit(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } /* * Allocate a dst for local (unicast / anycast) address. */ struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, .fc_ifindex = idev->dev->ifindex, .fc_flags = RTF_UP | RTF_NONEXTHOP, .fc_dst = *addr, .fc_dst_len = 128, .fc_protocol = RTPROT_KERNEL, .fc_nlinfo.nl_net = net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; cfg.fc_flags |= RTF_ANYCAST; } else { cfg.fc_type = RTN_LOCAL; cfg.fc_flags |= RTF_LOCAL; } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (IS_ERR(f6i)) return f6i; err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); if (err) return ERR_PTR(err); f6i->dst_nocount = true; if (!anycast && (READ_ONCE(net->ipv6.devconf_all->disable_policy) || READ_ONCE(idev->cnf.disable_policy))) f6i->dst_nopolicy = true; return f6i; } /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; spin_unlock_bh(&rt6_exception_lock); } return 0; } void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { .net = net, .addr = &ifp->addr, }; fib6_clean_all(net, fib6_remove_prefsrc, &adni); } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) return 0; nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) { fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_netdev_event { const struct net_device *dev; union { unsigned char nh_flags; unsigned long event; }; }; static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; } /* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; } static int rt6_multipath_total_weight(const struct fib6_info *rt) { struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh->fib_nh_weight; } return total; } static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; if (!rt6_is_dead(rt)) { *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } void rt6_multipath_rebalance(struct fib6_info *rt) { struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, * then there is no need to rebalance upon the removal of every * sibling route. */ if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to * make sure upper bounds are assigned from the first sibling * onwards. */ first = rt6_multipath_first_sibling(rt); if (WARN_ON_ONCE(!first)) return; total = rt6_multipath_total_weight(first); rt6_multipath_upper_bound_set(first, total); } static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && !rt->nh && rt->fib6_nh->fib_nh_dev == arg->dev) { rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } return 0; } void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, { .nh_flags = nh_flags, }, }; if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) arg.nh_flags |= RTNH_F_LINKDOWN; fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } /* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; } static void rt6_multipath_flush(struct fib6_info *rt) { struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { struct fib6_info *iter; unsigned int dead = 0; if (rt->fib6_nh->fib_nh_dev == down_dev || rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == down_dev || iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; } static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned char nh_flags) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } return 0; } void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { struct arg_netdev_event arg = { .dev = dev, { .event = event, }, }; struct net *net = dev_net(dev); if (net->ipv6.sysctl.skip_notify_on_dev_down) fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); else fib6_clean_all(net, fib6_ifdown, &arg); } void rt6_disable_ip(struct net_device *dev, unsigned long event) { rt6_sync_down_dev(dev, event); rt6_uncached_list_flush_dev(dev); neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; struct fib6_info *f6i; }; static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; struct fib6_info *f6i = arg->f6i; /* For administrative MTU increase, there is no way to discover * IPv6 PMTU increase, so PMTU increase should be updated here. * Since RFC 1981 doesn't include administrative MTU increase * update PMTU increase is a MUST. (i.e. jumbo frame) */ if (nh->fib_nh_dev == arg->dev) { struct inet6_dev *idev = __in6_dev_get(arg->dev); u32 mtu = f6i->fib6_pmtu; if (mtu >= arg->mtu || (mtu < arg->mtu && mtu == idev->cnf.mtu6)) fib6_metric_set(f6i, RTAX_MTU, arg->mtu); spin_lock_bh(&rt6_exception_lock); rt6_exceptions_update_pmtu(idev, nh, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } return 0; } static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. We still use this lock to block changes caused by addrconf/ndisc. */ idev = __in6_dev_get(arg->dev); if (!idev) return 0; if (fib6_metric_locked(f6i, RTAX_MTU)) return 0; arg->f6i = f6i; if (f6i->nh) { /* fib6_nh_mtu_change only returns 0, so this is safe */ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, arg); } return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) { struct rt6_mtu_change_arg arg = { .dev = dev, .mtu = mtu, }; fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, struct netlink_ext_ack *extack, bool newroute) { struct rtnexthop *rtnh; int remaining; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); return -EINVAL; } do { bool has_gateway = cfg->fc_flags & RTF_GATEWAY; int attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs; attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { if (nla_len(nla) < sizeof(cfg->fc_gateway)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); return -EINVAL; } has_gateway = true; } } if (newroute && (cfg->fc_nh_id || !has_gateway)) { NL_SET_ERR_MSG(extack, "Device only routes can not be added for IPv6 using the multipath API."); return -EINVAL; } rtnh = rtnh_next(rtnh, &remaining); } while (rtnh_ok(rtnh, remaining)); return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; struct rtmsg *rtm; unsigned int pref; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_tos) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): option not available for IPv6"); goto errout; } if (tb[RTA_FLOWLABEL]) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Flow label cannot be specified for this operation"); goto errout; } *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, .fc_src_len = rtm->rtm_src_len, .fc_flags = RTF_UP, .fc_protocol = rtm->rtm_protocol, .fc_type = rtm->rtm_type, .fc_nlinfo.portid = NETLINK_CB(skb).portid, .fc_nlinfo.nlh = nlh, .fc_nlinfo.nl_net = sock_net(skb->sk), }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || rtm->rtm_type == RTN_PROHIBIT || rtm->rtm_type == RTN_THROW) cfg->fc_flags |= RTF_REJECT; if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); if (tb[RTA_NH_ID]) { if (tb[RTA_GATEWAY] || tb[RTA_OIF] || tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); goto errout; } cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); } if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } if (tb[RTA_VIA]) { NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); goto errout; } if (tb[RTA_DST]) { int plen = (rtm->rtm_dst_len + 7) >> 3; if (nla_len(tb[RTA_DST]) < plen) goto errout; nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); } if (tb[RTA_SRC]) { int plen = (rtm->rtm_src_len + 7) >> 3; if (nla_len(tb[RTA_SRC]) < plen) goto errout; nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); } if (tb[RTA_PREFSRC]) cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_PRIORITY]) cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); if (tb[RTA_METRICS]) { cfg->fc_mx = nla_data(tb[RTA_METRICS]); cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); } if (tb[RTA_TABLE]) cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); if (tb[RTA_MULTIPATH]) { cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } if (tb[RTA_PREF]) { pref = nla_get_u8(tb[RTA_PREF]); if (pref != ICMPV6_ROUTER_PREF_LOW && pref != ICMPV6_ROUTER_PREF_HIGH) pref = ICMPV6_ROUTER_PREF_MEDIUM; cfg->fc_flags |= RTF_PREF(pref); } if (tb[RTA_ENCAP]) cfg->fc_encap = tb[RTA_ENCAP]; if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } if (tb[RTA_EXPIRES]) { unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); if (addrconf_finite_timeout(timeout)) { cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); cfg->fc_flags |= RTF_EXPIRES; } } err = 0; errout: return err; } struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head list; }; static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return -EEXIST; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->list, rt6_nh_list); return 0; } static void ip6_route_mpath_notify(struct fib6_info *rt, struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { /* if this is an APPEND route, then rt points to the first route * inserted and rt_last points to last route inserted. Userspace * wants a consistent dump of the route which starts at the first * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ rcu_read_lock(); if ((nlflags & NLM_F_APPEND) && rt_last && READ_ONCE(rt_last->fib6_nsiblings)) { rt = list_first_or_null_rcu(&rt_last->fib6_siblings, struct fib6_info, fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) { bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool should_notify = false; struct fib6_info *leaf; struct fib6_node *fn; rcu_read_lock(); fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; leaf = rcu_dereference(fn->leaf); if (!leaf) goto out; if (rt == leaf || (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf))) should_notify = true; out: rcu_read_unlock(); return should_notify; } static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; int replace; int nhn = 0; int err; err = fib6_config_validate(cfg, extack); if (err) return err; replace = (cfg->fc_nlinfo.nlh && (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto cleanup; } err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); if (err) { rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; } rtnh = rtnh_next(rtnh, &remaining); } /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; /* For add and replace, send one notification with all nexthops. For * append, send one notification with all appended nexthops. */ info->skip_notify_kernel = 1; err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { if (replace && nhn) NL_SET_ERR_MSG_MOD(extack, "multipath route replace failed (check consistency of installed routes)"); err_nh = nh; goto add_errout; } /* save reference to last route successfully inserted */ rt_last = nh->fib6_info; /* save reference to first route for notification */ if (!rt_notif) rt_notif = nh->fib6_info; /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: * fib6_add_rt2node() has rejected it; when replacing, old * nexthops have been replaced by first new, the rest should * be added to it. */ if (cfg->fc_nlinfo.nlh) { cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; } nhn++; } /* An in-kernel notification should only be sent in case the new * multipath route is added as the first route in the node, or if * it was appended to it. We pass 'rt_notif' since it is the first * sibling and might allow us to skip some checks in the replace case. */ if (ip6_route_mpath_should_notify(rt_notif)) { enum fib_event_type fib_event; if (rt_notif->fib6_nsiblings != nhn - 1) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_multipath_entry_notifiers(info->nl_net, fib_event, rt_notif, nhn - 1, extack); if (err) { /* Delete all the siblings that were just added */ err_nh = NULL; goto add_errout; } } /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; add_errout: /* send notification for routes that were added so that * the delete notifications sent by ip6_route_del are * coherent */ if (rt_notif) ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); list_del(&nh->list); kfree(nh); } return err; } static int ip6_route_multipath_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_config r_cfg; struct rtnexthop *rtnh; int last_err = 0; int remaining; int attrlen; int err; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; rtnh = rtnh_next(rtnh, &remaining); } return last_err; } static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_nh_id) { rcu_read_lock(); err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); return -EINVAL; } } if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } } static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else return ip6_route_add(&cfg, GFP_KERNEL, extack); } /* add the overhead of this fib6_nh to nexthop_len */ static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { int *nexthop_len = arg; *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16); /* RTA_GATEWAY */ if (nh->fib_nh_lws) { /* RTA_ENCAP_TYPE */ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); /* RTA_ENCAP */ *nexthop_len += nla_total_size(2); } return 0; } static size_t rt6_nlmsg_size(struct fib6_info *f6i) { struct fib6_info *sibling; struct fib6_nh *nh; int nexthop_len; if (f6i->nh) { nexthop_len = nla_total_size(4); /* RTA_NH_ID */ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); goto common; } rcu_read_lock(); retry: nh = f6i->fib6_nh; nexthop_len = 0; if (READ_ONCE(f6i->fib6_nsiblings)) { rt6_nh_nlmsg_size(nh, &nexthop_len); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); if (!READ_ONCE(f6i->fib6_nsiblings)) goto retry; } } rcu_read_unlock(); nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); common: return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ + nla_total_size(16) /* RTA_GATEWAY */ + nla_total_size(16) /* RTA_PREFSRC */ + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_IIF */ + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ + nexthop_len; } static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, unsigned char *flags) { if (nexthop_is_multipath(nh)) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) goto nla_put_failure; nla_nest_end(skb, mp); } else { struct fib6_nh *fib6_nh; fib6_nh = nexthop_fib6_nh(nh); if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, flags, false) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; if (rt6) { rt6_dst = &rt6->rt6i_dst; rt6_src = &rt6->rt6i_src; rt6_flags = rt6->rt6i_flags; } else { rt6_dst = &rt->fib6_dst; rt6_src = &rt->fib6_src; rt6_flags = rt->fib6_flags; } rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; rtm->rtm_dst_len = rt6_dst->plen; rtm->rtm_src_len = rt6_src->plen; rtm->rtm_tos = 0; if (rt->fib6_table) table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->fib6_protocol; if (rt6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { if (nla_put_in6_addr(skb, RTA_SRC, src)) goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt6_dst->addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) return 0; if (err < 0) goto nla_put_failure; } else #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ if (rt6) { struct net_device *dev; if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; dev = dst_dev(dst); if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (READ_ONCE(rt->fib6_nsiblings)) { struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, rt->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) goto nla_put_failure; rcu_read_lock(); list_for_each_entry_rcu(sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) && rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } else { if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, &nh_flags, false) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { expires = dst ? READ_ONCE(dst->expires) : rt->expires; expires -= jiffies; } if (!dst) { if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) { const struct net_device *dev = arg; if (nh->fib_nh_dev == dev) return 1; return 0; } static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { if (f6i->nh) { struct net_device *_dev = (struct net_device *)dev; return !!nexthop_for_each_fib6_nh(f6i->nh, fib6_info_nh_uses_dev, _dev); } if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (READ_ONCE(f6i->fib6_nsiblings)) { const struct fib6_info *sibling; rcu_read_lock(); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { if (sibling->fib6_nh->fib_nh_dev == dev) { rcu_read_unlock(); return true; } if (!READ_ONCE(f6i->fib6_nsiblings)) break; } rcu_read_unlock(); } return false; } struct fib6_nh_exception_dump_walker { struct rt6_rtnl_dump_arg *dump; struct fib6_info *rt; unsigned int flags; unsigned int skip; unsigned int count; }; static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_nh_exception_dump_walker *w = arg; struct rt6_rtnl_dump_arg *dump = w->dump; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i, err; bucket = fib6_nh_get_excptn_bucket(nh, NULL); if (!bucket) return 0; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (w->skip) { w->skip--; continue; } /* Expiration of entries doesn't bump sernum, insertion * does. Removal is triggered by insertion, so we can * rely on the fact that if entries change between two * partial dumps, this node is scanned again completely, * see rt6_insert_exception() and fib6_dump_table(). * * Count expired entries we go through as handled * entries that we'll skip next time, in case of partial * node dump. Otherwise, if entries expire meanwhile, * we'll skip the wrong amount. */ if (rt6_check_expired(rt6_ex->rt6i)) { w->count++; continue; } err = rt6_fill_node(dump->net, dump->skb, w->rt, &rt6_ex->rt6i->dst, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(dump->cb->skb).portid, dump->cb->nlh->nlmsg_seq, w->flags); if (err) return err; w->count++; } bucket++; } return 0; } /* Return -1 if done with node, number of handled routes on partial dump */ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; int count = 0; if (rt == net->ipv6.fib6_null_entry) return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return -1; } if (filter->filter_set && ((filter->rt_type && rt->fib6_type != filter->rt_type) || (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || (filter->protocol && rt->fib6_protocol != filter->protocol))) { return -1; } if (filter->filter_set || !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } if (filter->dump_routes) { if (skip) { skip--; } else { if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, flags)) { return 0; } count++; } } if (filter->dump_exceptions) { struct fib6_nh_exception_dump_walker w = { .dump = arg, .rt = rt, .flags = flags, .skip = skip, .count = 0 }; int err; rcu_read_lock(); if (rt->nh) { err = nexthop_for_each_fib6_nh(rt->nh, rt6_nh_dump_exceptions, &w); } else { err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); } rcu_read_unlock(); if (err) return count + w.count; } return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } if (tb[RTA_FLOWLABEL] && (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Invalid flow label"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_SRC: case RTA_DST: case RTA_IIF: case RTA_OIF: case RTA_MARK: case RTA_UID: case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) iif = nla_get_u32(tb[RTA_IIF]); if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); if (tb[RTA_UID]) fl6.flowi6_uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); if (tb[RTA_SPORT]) fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &fl6.flowi6_proto, AF_INET6, extack); if (err) goto errout; } flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); if (iif) { struct net_device *dev; int flags = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) { rcu_read_unlock(); err = -ENODEV; goto errout; } fl6.flowi6_iif = iif; if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; dst = ip6_route_output(net, NULL, &fl6); } rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } if (rt == net->ipv6.ip6_null_entry) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); err = -ENOBUFS; goto errout; } skb_dst_set(skb, &rt->dst); rcu_read_lock(); from = rcu_dereference(rt->from); if (from) { if (fibmatch) err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } else { err = -ENETUNREACH; } rcu_read_unlock(); if (err < 0) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { struct net *net = info->nl_net; struct sk_buff *skb; size_t sz; u32 seq; int err; err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; rcu_read_lock(); sz = rt6_nlmsg_size(rt); retry: skb = nlmsg_new(sz, GFP_ATOMIC); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, event, info->portid, seq, nlm_flags); if (err < 0) { kfree_skb(skb); /* -EMSGSIZE implies needed space grew under us. */ if (err == -EMSGSIZE) { sz = max(rt6_nlmsg_size(rt), sz << 1); goto retry; } goto errout; } rcu_read_unlock(); rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: rcu_read_unlock(); rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed) { struct sk_buff *skb; int err; if (READ_ONCE(f6i->offload) == offload && READ_ONCE(f6i->trap) == trap && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload, offload); WRITE_ONCE(f6i->trap, trap); /* 2 means send notifications only if offload_failed was changed. */ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send * notification. */ return; if (!net->ipv6.sysctl.fib_notify_on_flag_change) return; skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } EXPORT_SYMBOL(fib6_info_hw_flags_set); static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (!(dev->flags & IFF_LOOPBACK)) return NOTIFY_OK; if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif } else if (event == NETDEV_UNREGISTER && dev->reg_state != NETREG_UNREGISTERED) { /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } return NOTIFY_OK; } /* * /proc */ #ifdef CONFIG_PROC_FS static int rt6_stats_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), net->ipv6.rt6_stats->fib_discarded_routes); return 0; } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; int ret; if (!write) return -EINVAL; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; net = (struct net *)ctl->extra1; delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } static struct ctl_table ipv6_route_table_template[] = { { .procname = "max_size", .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_thresh", .data = &ip6_dst_ops_template.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv6_sysctl_rtcache_flush }, { .procname = "gc_min_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_timeout", .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_elasticity", .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "mtu_expires", .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_min_interval_ms", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "skip_notify_on_dev_down", .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_route_table_template, sizeof(ipv6_route_table_template), GFP_KERNEL); if (table) { table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; table[2].data = &net->ipv6.sysctl.flush_delay; table[2].extra1 = net; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; } return table; } size_t ipv6_route_sysctl_table_size(struct net *net) { /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) return 1; return ARRAY_SIZE(ipv6_route_table_template); } #endif static int __net_init ip6_route_net_init(struct net *net) { int ret = -ENOMEM; memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, sizeof(net->ipv6.ip6_dst_ops)); if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_SUBTREES net->ipv6.fib6_routes_require_src = 0; #endif #endif net->ipv6.sysctl.flush_delay = 0; net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: return ret; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_ip6_prohibit_entry: kfree(net->ipv6.ip6_prohibit_entry); out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif out_fib6_null_entry: kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: goto out; } static void __net_exit ip6_route_net_exit(struct net *net) { kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif dst_entries_destroy(&net->ipv6.ip6_dst_ops); } static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, sizeof(struct ipv6_route_iter))) return -ENOMEM; if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, rt6_stats_seq_show, NULL)) { remove_proc_entry("ipv6_route", net->proc_net); return -ENOMEM; } #endif return 0; } static void __net_exit ip6_route_net_exit_late(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ipv6_route", net->proc_net); remove_proc_entry("rt6_stats", net->proc_net); #endif } static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, }; static int __net_init ipv6_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv6.peers = bp; return 0; } static void __net_exit ipv6_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv6.peers; net->ipv6.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, }; static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, }; static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, .priority = ADDRCONF_NOTIFY_PRIORITY - 10, }; void __init ip6_route_init_special_entries(void) { /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #endif } #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), }; static struct bpf_iter_reg ipv6_route_reg_info = { .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) { bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip6_route_init(void) { int ret; int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; ret = register_pernet_subsys(&ipv6_inetpeer_ops); if (ret) goto out_dst_entries; ret = register_pernet_subsys(&ip6_route_net_ops); if (ret) goto out_register_inetpeer; ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; ret = fib6_init(); if (ret) goto out_register_subsys; ret = xfrm6_init(); if (ret) goto out_fib6_init; ret = fib6_rules_init(); if (ret) goto xfrm6_init; ret = register_pernet_subsys(&ip6_route_net_late_ops); if (ret) goto fib6_rules_init; ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); if (ret) goto out_register_late_subsys; #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) goto out_register_late_subsys; #endif #endif for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } out: return ret; out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: xfrm6_fini(); out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); out_register_inetpeer: unregister_pernet_subsys(&ipv6_inetpeer_ops); out_dst_entries: dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; } void ip6_route_cleanup(void) { #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_unregister(); #endif #endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ipv6_inetpeer_ops); unregister_pernet_subsys(&ip6_route_net_ops); dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); }
17 9 4 4 4 3 1 15 15 15 14 1 8 8 8 7 1 23 14 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 // SPDX-License-Identifier: GPL-2.0-only /* * Transparent proxy support for Linux/iptables * * Copyright (C) 2007-2008 BalaBit IT Ltd. * Author: Krisztian Kovacs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> #include <net/sock.h> #include <net/inet_sock.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/inet6_hashtables.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif #include <net/netfilter/nf_socket.h> #include <linux/netfilter/xt_socket.h> /* "socket" match based redirection (no specific rule) * =================================================== * * There are connections with dynamic endpoints (e.g. FTP data * connection) that the user is unable to add explicit rules * for. These are taken care of by a generic "socket" rule. It is * assumed that the proxy application is trusted to open such * connections without explicit iptables rule (except of course the * generic 'socket' rule). In this case the following sockets are * matched in preference order: * * - match: if there's a fully established connection matching the * _packet_ tuple * * - match: if there's a non-zero bound listener (possibly with a * non-local address) We don't accept zero-bound listeners, since * then local services could intercept traffic going through the * box. */ static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) { struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); if (sk) { bool wildcard; bool transparent = true; /* Ignore sockets listening on INADDR_ANY, * unless XT_SOCKET_NOWILDCARD is set */ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && sk_fullsock(sk) && inet_sk(sk)->inet_rcv_saddr == 0); /* Ignore non-transparent sockets, * if XT_SOCKET_TRANSPARENT is used */ if (info->flags & XT_SOCKET_TRANSPARENT) transparent = inet_sk_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); if (wildcard || !transparent) sk = NULL; } return sk != NULL; } static bool socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) { static struct xt_socket_mtinfo1 xt_info_v0 = { .flags = 0, }; return socket_match(skb, par, &xt_info_v0); } static bool socket_mt4_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static bool socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); if (sk) { bool wildcard; bool transparent = true; /* Ignore sockets listening on INADDR_ANY * unless XT_SOCKET_NOWILDCARD is set */ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && sk_fullsock(sk) && ipv6_addr_any(&sk->sk_v6_rcv_saddr)); /* Ignore non-transparent sockets, * if XT_SOCKET_TRANSPARENT is used */ if (info->flags & XT_SOCKET_TRANSPARENT) transparent = inet_sk_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); if (wildcard || !transparent) sk = NULL; } return sk != NULL; } #endif static int socket_mt_enable_defrag(struct net *net, int family) { switch (family) { case NFPROTO_IPV4: return nf_defrag_ipv4_enable(net); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: return nf_defrag_ipv6_enable(net); #endif } WARN_ONCE(1, "Unknown family %d\n", family); return 0; } static int socket_mt_v1_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; int err; err = socket_mt_enable_defrag(par->net, par->family); if (err) return err; if (info->flags & ~XT_SOCKET_FLAGS_V1) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); return -EINVAL; } return 0; } static int socket_mt_v2_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; int err; err = socket_mt_enable_defrag(par->net, par->family); if (err) return err; if (info->flags & ~XT_SOCKET_FLAGS_V2) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); return -EINVAL; } return 0; } static int socket_mt_v3_check(const struct xt_mtchk_param *par) { const struct xt_socket_mtinfo3 *info = (struct xt_socket_mtinfo3 *)par->matchinfo; int err; err = socket_mt_enable_defrag(par->net, par->family); if (err) return err; if (info->flags & ~XT_SOCKET_FLAGS_V3) { pr_info_ratelimited("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V3); return -EINVAL; } return 0; } static void socket_mt_destroy(const struct xt_mtdtor_param *par) { if (par->family == NFPROTO_IPV4) nf_defrag_ipv4_disable(par->net); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) else if (par->family == NFPROTO_IPV6) nf_defrag_ipv6_disable(par->net); #endif } static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", .revision = 0, .family = NFPROTO_IPV4, .match = socket_mt4_v0, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .destroy = socket_mt_destroy, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "socket", .revision = 1, .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .destroy = socket_mt_destroy, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, #endif { .name = "socket", .revision = 2, .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "socket", .revision = 2, .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, #endif { .name = "socket", .revision = 3, .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v3_check, .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "socket", .revision = 3, .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v3_check, .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, #endif }; static int __init socket_mt_init(void) { return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); } static void __exit socket_mt_exit(void) { xt_unregister_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); } module_init(socket_mt_init); module_exit(socket_mt_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); MODULE_DESCRIPTION("x_tables socket match module"); MODULE_ALIAS("ipt_socket"); MODULE_ALIAS("ip6t_socket");
3 3 3 50 50 1 1 3 49 41 9 50 1 48 1 1 48 1 10 42 3 3 3 3 1 3 3 3 3 3 3 47 7 1 40 1 47 46 46 47 47 43 2 2 2 2 3 1 1 1 1 1 3 2 1 3 3 2 8 8 4 2 1 2 6 2 4 5 5 2 2 3 1 1 1 2 3 3 3 4 2 1 2 2 1 3 2 1 1 4 4 3 3 4 4 4 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/igmp.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/netlink.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> #endif #include "br_private.h" static bool br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { *timer = br_timer_value(&pmctx->ip4_mc_router_timer); return !hlist_unhashed(&pmctx->ip4_rlist); } static bool br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) *timer = br_timer_value(&pmctx->ip6_mc_router_timer); return !hlist_unhashed(&pmctx->ip6_rlist); #else *timer = 0; return false; #endif } static size_t __br_rports_one_size(void) { return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ } size_t br_rports_size(const struct net_bridge_mcast *brmctx) { struct net_bridge_mcast_port *pmctx; size_t size = nla_total_size(0); /* MDBA_ROUTER */ rcu_read_lock(); hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) size += __br_rports_one_size(); #if IS_ENABLED(CONFIG_IPV6) hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) size += __br_rports_one_size(); #endif rcu_read_unlock(); return size; } int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx) { u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; struct net_bridge_port *p; if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { struct net_bridge_mcast_port *pmctx; if (vid) { struct net_bridge_vlan *v; v = br_vlan_find(nbp_vlan_group(p), vid); if (!v) continue; pmctx = &v->port_mcast_ctx; } else { pmctx = &p->multicast_ctx; } have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, ip6_timer)) || (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } nla_nest_end(skb, port_nest); } nla_nest_end(skb, nest); return 0; fail: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) { e->state = flags & MDB_PG_FLAGS_PERMANENT; e->flags = 0; if (flags & MDB_PG_FLAGS_OFFLOAD) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; switch (ip->proto) { case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); break; #endif default: ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr); } } static int __mdb_fill_srcs(struct sk_buff *skb, struct net_bridge_port_group *p) { struct net_bridge_group_src *ent; struct nlattr *nest, *nest_ent; if (hlist_empty(&p->src_list)) return 0; nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); if (!nest) return -EMSGSIZE; hlist_for_each_entry_rcu(ent, &p->src_list, node, lockdep_is_held(&p->key.port->br->multicast_lock)) { nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; switch (ent->addr.proto) { case htons(ETH_P_IP): if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, ent->addr.src.ip4)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, &ent->addr.src.ip6)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } break; #endif default: nla_nest_cancel(skb, nest_ent); continue; } if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, br_timer_value(&ent->timer))) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } nla_nest_end(skb, nest_ent); } nla_nest_end(skb, nest); return 0; out_cancel_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; u8 flags = 0; int ifindex; memset(&e, 0, sizeof(e)); if (p) { ifindex = p->key.port->dev->ifindex; mtimer = &p->timer; flags = p->flags; } else { ifindex = mp->br->dev->ifindex; mtimer = &mp->timer; } __mdb_entry_fill_flags(&e, flags); e.ifindex = ifindex; e.vid = mp->addr.vid; if (mp->addr.proto == htons(ETH_P_IP)) { e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) } else if (mp->addr.proto == htons(ETH_P_IPV6)) { e.addr.u.ip6 = mp->addr.dst.ip6; #endif } else { ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); e.state = MDB_PERMANENT; } e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO); if (!nest_ent) return -EMSGSIZE; if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) goto nest_err; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) goto nest_err; break; } break; #endif default: ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr); } if (p) { if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) goto nest_err; if (dump_srcs_mode && (__mdb_fill_srcs(skb, p) || nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) goto nest_err; } nla_nest_end(skb, nest_ent); return 0; nest_err: nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; if (idx < s_idx) goto skip; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest2) { err = -EMSGSIZE; break; } if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); break; } } for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { if (!p->key.port) continue; if (pidx < s_pidx) goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_end(skb, nest2); goto out; } skip_pg: pidx++; } pidx = 0; s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; } out: cb->args[1] = idx; cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->ifindex = dev->ifindex; rcu_read_lock(); err = br_mdb_fill_info(skb, cb, dev); if (err) goto out; err = br_rports_fill_info(skb, &br->multicast_ctx); if (err) goto out; out: rcu_read_unlock(); nlmsg_end(skb, nlh); return err; } static int nlmsg_populate_mdb_fill(struct sk_buff *skb, struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct nlmsghdr *nlh; struct br_port_msg *bpm; struct nlattr *nest, *nest2; nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) goto cancel; nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (nest2 == NULL) goto end; if (__mdb_fill_info(skb, mp, pg)) goto end; nla_nest_end(skb, nest2); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t rtnl_mdb_nlmsg_pg_size(const struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; size_t nlmsg_size, addr_size = 0; /* MDBA_MDB_ENTRY_INFO */ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) + /* MDBA_MDB_EATTR_TIMER */ nla_total_size(sizeof(u32)); if (!pg) goto out; /* MDBA_MDB_EATTR_RTPROT */ nlmsg_size += nla_total_size(sizeof(u8)); switch (pg->key.addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; #endif } /* MDBA_MDB_EATTR_GROUP_MODE */ nlmsg_size += nla_total_size(sizeof(u8)); /* MDBA_MDB_EATTR_SRC_LIST nested attr */ if (!hlist_empty(&pg->src_list)) nlmsg_size += nla_total_size(0); hlist_for_each_entry(ent, &pg->src_list, node) { /* MDBA_MDB_SRCLIST_ENTRY nested attr + * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER */ nlmsg_size += nla_total_size(0) + nla_total_size(addr_size) + nla_total_size(sizeof(u32)); } out: return nlmsg_size; } static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0) + /* Port group entry */ rtnl_mdb_nlmsg_pg_size(pg); } static void __br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; if (notify_switchdev) br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { __br_mdb_notify(dev, mp, pg, type, true); } void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg) { __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (!nest) goto cancel; port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto end; if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { nla_nest_cancel(skb, port_nest); goto end; } if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { nla_nest_cancel(skb, port_nest); goto end; } nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; end: nla_nest_end(skb, nest); cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) + nla_total_size(sizeof(__u32)) + nla_total_size(sizeof(u16)); } void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; u16 vid; ifindex = pmctx ? pmctx->port->dev->ifindex : 0; vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } static const struct nla_policy br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static const struct nla_policy br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), }; static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), }; static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, struct netlink_ext_ack *extack) { switch (proto) { case htons(ETH_P_IP): if (nla_len(attr) != sizeof(struct in_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); return false; } if (ipv4_is_multicast(nla_get_in_addr(attr))) { NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); return false; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr src; if (nla_len(attr) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); return false; } src = nla_get_in6_addr(attr); if (ipv6_addr_is_multicast(&src)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); return false; } break; } #endif default: NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); return false; } return true; } static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { struct net_bridge_mcast *brmctx = NULL; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { brmctx = &br->multicast_ctx; goto out; } if (!entry->vid) { NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); goto out; } v = br_vlan_find(br_vlan_group(br), entry->vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); goto out; } if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); goto out; } brmctx = &v->br_mcast_ctx; out: return brmctx; } static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags) { unsigned long now = jiffies; pg->flags = flags; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); return 0; } static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_sg(cfg, mp, p, brmctx, flags); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, MCAST_INCLUDE, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for * proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) { struct net_bridge_mdb_entry *star_mp; struct br_ip star_group; star_group = p->key.addr; memset(&star_group.src, 0, sizeof(star_group.src)); star_mp = br_mdb_ip_get(cfg->br, &star_group); if (star_mp) br_multicast_sg_add_exclude_ports(star_mp, p); } return 0; } static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, struct br_ip *src_ip, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *sgmp; struct br_mdb_config sg_cfg; struct br_ip sg_ip; u8 flags = 0; sg_ip = cfg->group; sg_ip.src = src_ip->src; sgmp = br_multicast_new_group(cfg->br, &sg_ip); if (IS_ERR(sgmp)) { NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry"); return PTR_ERR(sgmp); } if (cfg->entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (cfg->filter_mode == MCAST_EXCLUDE) flags |= MDB_PG_FLAGS_BLOCKED; memset(&sg_cfg, 0, sizeof(sg_cfg)); sg_cfg.br = cfg->br; sg_cfg.p = cfg->p; sg_cfg.entry = cfg->entry; sg_cfg.group = sg_ip; sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; sg_cfg.rt_protocol = cfg->rt_protocol; sg_cfg.nlflags = cfg->nlflags; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } static int br_mdb_add_group_src(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct br_mdb_src_entry *src, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; unsigned long now = jiffies; int err; ent = br_multicast_find_group_src(pg, &src->addr); if (!ent) { ent = br_multicast_new_group_src(pg, &src->addr); if (!ent) { NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); return -ENOSPC; } } else if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); return -EEXIST; } if (cfg->filter_mode == MCAST_INCLUDE && cfg->entry->state == MDB_TEMPORARY) mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); else timer_delete(&ent->timer); /* Install a (S, G) forwarding entry for the source. */ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); if (err) goto err_del_sg; ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED; return 0; err_del_sg: __br_multicast_del_group_src(ent); return err; } static void br_mdb_del_group_src(struct net_bridge_port_group *pg, struct br_mdb_src_entry *src) { struct net_bridge_group_src *ent; ent = br_multicast_find_group_src(pg, &src->addr); if (WARN_ON_ONCE(!ent)) return; br_multicast_del_group_src(ent, false); } static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { int i, err; for (i = 0; i < cfg->num_src_entries; i++) { err = br_mdb_add_group_src(cfg, pg, brmctx, &cfg->src_entries[i], extack); if (err) goto err_del_group_srcs; } return 0; err_del_group_srcs: for (i--; i >= 0; i--) br_mdb_del_group_src(pg, &cfg->src_entries[i]); return err; } static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, struct netlink_ext_ack *extack) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int err; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack); if (err) goto err_clear_delete; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_DELETE) br_multicast_del_group_src(ent, false); } return 0; err_clear_delete: hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_DELETE; return err; } static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { unsigned long now = jiffies; int err; err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack); if (err) return err; pg->flags = flags; pg->filter_mode = cfg->filter_mode; pg->rt_protocol = cfg->rt_protocol; if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) br_multicast_star_g_handle_mode(pg, cfg->filter_mode); return 0; } static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, unsigned char flags, struct netlink_ext_ack *extack) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; int err; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); return -EEXIST; } return br_mdb_replace_group_star_g(cfg, mp, p, brmctx, flags, extack); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, cfg->filter_mode, cfg->rt_protocol, extack); if (unlikely(!p)) return -ENOMEM; err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); if (err) goto err_del_port_group; rcu_assign_pointer(*pp, p); if (!(flags & MDB_PG_FLAGS_PERMANENT) && cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* If we are adding a new EXCLUDE port group (*, G), it needs to be * also added to all (S, G) entries for proper replication. */ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) && cfg->filter_mode == MCAST_EXCLUDE) br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); return 0; err_del_port_group: br_multicast_del_port_group(p); return err; } static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge_port *port = cfg->p; struct net_bridge_mdb_entry *mp; struct net_bridge *br = cfg->br; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; unsigned char flags = 0; brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) return -EINVAL; mp = br_multicast_new_group(br, &group); if (IS_ERR(mp)) return PTR_ERR(mp); /* host join */ if (!port) { if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; } if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; if (br_multicast_is_star_g(&group)) return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack); else return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack); } static int __br_mdb_add(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { int ret; spin_lock_bh(&cfg->br->multicast_lock); ret = br_mdb_add_group(cfg, extack); spin_unlock_bh(&cfg->br->multicast_lock); return ret; } static int br_mdb_config_src_entry_init(struct nlattr *src_entry, struct br_mdb_src_entry *src, __be16 proto, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; int err; err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, br_mdbe_src_list_entry_pol, extack); if (err) return err; if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) return -EINVAL; if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) return -EINVAL; src->addr.proto = proto; nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], nla_len(tb[MDBE_SRCATTR_ADDRESS])); return 0; } static int br_mdb_config_src_list_init(struct nlattr *src_list, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *src_entry; int rem, err; int i = 0; nla_for_each_nested(src_entry, src_list, rem) cfg->num_src_entries++; if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", PG_SRC_ENT_LIMIT - 1); return -EINVAL; } cfg->src_entries = kcalloc(cfg->num_src_entries, sizeof(struct br_mdb_src_entry), GFP_KERNEL); if (!cfg->src_entries) return -ENOMEM; nla_for_each_nested(src_entry, src_list, rem) { err = br_mdb_config_src_entry_init(src_entry, &cfg->src_entries[i], cfg->entry->addr.proto, extack); if (err) goto err_src_entry_init; i++; } return 0; err_src_entry_init: kfree(cfg->src_entries); return err; } static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) { kfree(cfg->src_entries); } static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; int err; err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs, br_mdbe_attrs_pol, extack); if (err) return err; if (mdb_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], cfg->entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); return -EINVAL; } cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); } else { cfg->filter_mode = MCAST_EXCLUDE; } if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); return -EINVAL; } if (!br_multicast_is_star_g(&cfg->group)) { NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); return -EINVAL; } if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); return -EINVAL; } err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], cfg, extack); if (err) return err; } if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); return -EINVAL; } if (mdb_attrs[MDBE_ATTR_RTPROT]) { if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); return -EINVAL; } cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); } return 0; } static int br_mdb_config_init(struct br_mdb_config *cfg, struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; cfg->rt_protocol = RTPROT_STATIC; cfg->nlflags = nlmsg_flags; cfg->br = netdev_priv(dev); if (!netif_running(cfg->br->dev)) { NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); return -EINVAL; } if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); return -EINVAL; } cfg->entry = nla_data(tb[MDBA_SET_ENTRY]); if (cfg->entry->ifindex != cfg->br->dev->ifindex) { struct net_device *pdev; pdev = __dev_get_by_index(net, cfg->entry->ifindex); if (!pdev) { NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); return -ENODEV; } cfg->p = br_port_get_rtnl(pdev); if (!cfg->p) { NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); return -EINVAL; } if (cfg->p->br != cfg->br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } } if (cfg->entry->addr.proto == htons(ETH_P_IP) && ipv4_is_zeronet(cfg->entry->addr.u.ip4)) { NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address 0.0.0.0 is not allowed"); return -EINVAL; } if (tb[MDBA_SET_ENTRY_ATTRS]) return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg, extack); else __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL); return 0; } static void br_mdb_config_fini(struct br_mdb_config *cfg) { br_mdb_config_src_list_fini(cfg); } int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack); if (err) return err; err = -EINVAL; /* host join errors which can happen before creating the group */ if (!cfg.p && !br_group_is_l2(&cfg.group)) { /* don't allow any flags for host-joined IP groups */ if (cfg.entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); goto out; } if (!br_multicast_is_star_g(&cfg.group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); goto out; } } if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); goto out; } if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); goto out; } vg = nbp_vlan_group(cfg.p); } else { vg = br_vlan_group(cfg.br); } /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_add(&cfg, extack); if (err) break; } } else { err = __br_mdb_add(&cfg, extack); } out: br_mdb_config_fini(&cfg); return err; } static int __br_mdb_del(const struct br_mdb_config *cfg) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge *br = cfg->br; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip ip = cfg->group; int err = -EINVAL; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); if (!mp) goto unlock; /* host leave */ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) { br_multicast_host_leave(mp, false); err = 0; br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); goto unlock; } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex) continue; br_multicast_del_pg(mp, p, pp); err = 0; break; } unlock: spin_unlock_bh(&br->multicast_lock); return err; } int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct br_mdb_config cfg; int err; err = br_mdb_config_init(&cfg, dev, tb, 0, extack); if (err) return err; if (cfg.p) vg = nbp_vlan_group(cfg.p); else vg = br_vlan_group(cfg.br); /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. */ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; err = __br_mdb_del(&cfg); } } else { err = __br_mdb_del(&cfg); } br_mdb_config_fini(&cfg); return err; } struct br_mdb_flush_desc { u32 port_ifindex; u16 vid; u8 rt_protocol; u8 state; u8 state_mask; }; static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT), }; static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; desc->port_ifindex = entry->ifindex; desc->vid = entry->vid; desc->state = entry->state; if (!tb[MDBA_SET_ENTRY_ATTRS]) return 0; err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_SET_ENTRY_ATTRS], br_mdbe_attrs_del_bulk_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_STATE_MASK]) desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]); if (mdbe_attrs[MDBE_ATTR_RTPROT]) desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]); return 0; } static void br_mdb_flush_host(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { u8 state; if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex) return; if (desc->rt_protocol) return; state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) return; br_multicast_host_leave(mp, true); if (!mp->ports && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_mdb_flush_pgs(struct net_bridge *br, struct net_bridge_mdb_entry *mp, const struct br_mdb_flush_desc *desc) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) { u8 state; if (desc->port_ifindex && desc->port_ifindex != p->key.port->dev->ifindex) { pp = &p->next; continue; } if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) { pp = &p->next; continue; } state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0; if (desc->state_mask && (state & desc->state_mask) != desc->state) { pp = &p->next; continue; } br_multicast_del_pg(mp, p, pp); } } static void br_mdb_flush(struct net_bridge *br, const struct br_mdb_flush_desc *desc) { struct net_bridge_mdb_entry *mp; spin_lock_bh(&br->multicast_lock); /* Safe variant is not needed because entries are removed from the list * upon group timer expiration or bridge deletion. */ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { if (desc->vid && desc->vid != mp->addr.vid) continue; br_mdb_flush_host(br, mp, desc); br_mdb_flush_pgs(br, mp, desc); } spin_unlock_bh(&br->multicast_lock); } int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct br_mdb_flush_desc desc = {}; int err; err = br_mdb_flush_desc_init(&desc, tb, extack); if (err) return err; br_mdb_flush(br, &desc); return 0; } static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), }; static int br_mdb_get_parse(struct net_device *dev, struct nlattr *tb[], struct br_ip *group, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]); struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1]; int err; if (!tb[MDBA_GET_ENTRY_ATTRS]) { __mdb_entry_to_br_ip(entry, group, NULL); return 0; } err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, tb[MDBA_GET_ENTRY_ATTRS], br_mdbe_attrs_get_pol, extack); if (err) return err; if (mdbe_attrs[MDBE_ATTR_SOURCE] && !is_valid_mdb_source(mdbe_attrs[MDBE_ATTR_SOURCE], entry->addr.proto, extack)) return -EINVAL; __mdb_entry_to_br_ip(entry, group, mdbe_attrs); return 0; } static struct sk_buff * br_mdb_get_reply_alloc(const struct net_bridge_mdb_entry *mp) { struct net_bridge_port_group *pg; size_t nlmsg_size; nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) + /* MDBA_MDB */ nla_total_size(0) + /* MDBA_MDB_ENTRY */ nla_total_size(0); if (mp->host_joined) nlmsg_size += rtnl_mdb_nlmsg_pg_size(NULL); for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) nlmsg_size += rtnl_mdb_nlmsg_pg_size(pg); return nlmsg_new(nlmsg_size, GFP_ATOMIC); } static int br_mdb_get_reply_fill(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, u32 portid, u32 seq) { struct nlattr *mdb_nest, *mdb_entry_nest; struct net_bridge_port_group *pg; struct br_port_msg *bpm; struct nlmsghdr *nlh; int err; nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0); if (!nlh) return -EMSGSIZE; bpm = nlmsg_data(nlh); memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = mp->br->dev->ifindex; mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB); if (!mdb_nest) { err = -EMSGSIZE; goto cancel; } mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!mdb_entry_nest) { err = -EMSGSIZE; goto cancel; } if (mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) goto cancel; } for (pg = mlock_dereference(mp->ports, mp->br); pg; pg = mlock_dereference(pg->next, mp->br)) { err = __mdb_fill_info(skb, mp, pg); if (err) goto cancel; } nla_nest_end(skb, mdb_entry_nest); nla_nest_end(skb, mdb_nest); nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return err; } int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct sk_buff *skb; struct br_ip group; int err; err = br_mdb_get_parse(dev, tb, &group, extack); if (err) return err; /* Hold the multicast lock to ensure that the MDB entry does not change * between the time the reply size is determined and when the reply is * filled in. */ spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &group); if (!mp || (!mp->ports && !mp->host_joined)) { NL_SET_ERR_MSG_MOD(extack, "MDB entry not found"); err = -ENOENT; goto unlock; } skb = br_mdb_get_reply_alloc(mp); if (!skb) { err = -ENOMEM; goto unlock; } err = br_mdb_get_reply_fill(skb, mp, portid, seq); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply"); goto free; } spin_unlock_bh(&br->multicast_lock); return rtnl_unicast(skb, dev_net(dev), portid); free: kfree_skb(skb); unlock: spin_unlock_bh(&br->multicast_lock); return err; }
155 155 120 120 75 8 71 3 4 4 7 5 3 2 4 4 4 7 3 7 1 1 58 58 7 14 7 114 113 2 218 217 1 13 62 83 103 6 13 13 3 7 13 13 127 64 110 27 98 5 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 // SPDX-License-Identifier: GPL-2.0-only /* * TCP CUBIC: Binary Increase Congestion control for TCP v2.3 * Home page: * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC * This is from the implementation of CUBIC TCP in * Sangtae Ha, Injong Rhee and Lisong Xu, * "CUBIC: A New TCP-Friendly High-Speed TCP Variant" * in ACM SIGOPS Operating System Review, July 2008. * Available from: * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf * * CUBIC integrates a new slow start algorithm, called HyStart. * The details of HyStart are presented in * Sangtae Ha and Injong Rhee, * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008. * Available from: * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf * * All testing results are available from: * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing * * Unless CUBIC is enabled and congestion window is large * this behaves the same as the original Reno. */ #include <linux/mm.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/module.h> #include <linux/math64.h> #include <net/tcp.h> #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ #define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ /* Two methods of hybrid slow start */ #define HYSTART_ACK_TRAIN 0x1 #define HYSTART_DELAY 0x2 /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 #define HYSTART_DELAY_MIN (4000U) /* 4 ms */ #define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */ static int initial_ssthresh __read_mostly; static int bic_scale __read_mostly = 41; static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; static u64 cube_factor __read_mostly; /* Note parameters that are used for precomputing scale factors are read-only */ module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(bic_scale, int, 0444); MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); module_param(hystart, int, 0644); MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm"); module_param(hystart_detect, int, 0644); MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); module_param(hystart_ack_delta_us, int, 0644); MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ u16 unused; u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ u32 end_seq; /* end_seq of the round */ u32 last_ack; /* last time when the ACK spacing is close */ u32 curr_rtt; /* the minimum rtt of current round */ }; static inline void bictcp_reset(struct bictcp *ca) { memset(ca, 0, offsetof(struct bictcp, unused)); ca->found = 0; } static inline u32 bictcp_clock_us(const struct sock *sk) { return tcp_sk(sk)->tcp_mstamp; } static inline void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; ca->curr_rtt = ~0U; ca->sample_cnt = 0; } __bpf_kfunc static void cubictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); if (hystart) bictcp_hystart_reset(sk); if (!hystart && initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } __bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); u32 now = tcp_jiffies32; s32 delta; delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. * Shift epoch_start to keep cwnd growth to cubic curve. */ if (ca->epoch_start && delta > 0) { ca->epoch_start += delta; if (after(ca->epoch_start, now)) ca->epoch_start = now; } return; } } /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% */ static u32 cubic_root(u64 a) { u32 x, b, shift; /* * cbrt(x) MSB values for x MSB values in [0..63]. * Precomputed then refined by hand - Willy Tarreau * * For x in [0..63], * v = cbrt(x << 18) - 1 * cbrt(x) = (v[x] + 10) >> 6 */ static const u8 v[] = { /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118, /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156, /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179, /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199, /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215, /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229, /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242, /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254, }; b = fls64(a); if (b < 7) { /* a in [0..63] */ return ((u32)v[(u32)a] + 35) >> 6; } b = ((b * 84) >> 8) - 1; shift = (a >> (b * 3)); x = ((u32)(((u32)v[shift] + 10) << b)) >> 6; /* * Newton-Raphson iteration * 2 * x = ( 2 * x + a / x ) / 3 * k+1 k k */ x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1))); x = ((x * 341) >> 10); return x; } /* * Compute congestion window to use. */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) { u32 delta, bic_target, max_cnt; u64 offs, t; ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; /* The CUBIC function can update ca->cnt at most once per jiffy. * On all cwnd reduction events, ca->epoch_start is set to 0, * which will force a recalculation of ca->cnt. */ if (ca->epoch_start && tcp_jiffies32 == ca->last_time) goto tcp_friendliness; ca->last_cwnd = cwnd; ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) { ca->epoch_start = tcp_jiffies32; /* record beginning */ ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ if (ca->last_max_cwnd <= cwnd) { ca->bic_K = 0; ca->bic_origin_point = cwnd; } else { /* Compute new K based on * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */ ca->bic_K = cubic_root(cube_factor * (ca->last_max_cwnd - cwnd)); ca->bic_origin_point = ca->last_max_cwnd; } } /* cubic function - calc*/ /* calculate c * time^3 / rtt, * while considering overflow in calculation of time^3 * (so time^3 is done by using 64 bit) * and without the support of division of 64bit numbers * (so all divisions are done by using 32 bit) * also NOTE the unit of those veriables * time = (t - K) / 2^bictcp_HZ * c = bic_scale >> 10 * rtt = (srtt >> 3) / HZ * !!! The following code does not have overflow problems, * if the cwnd < 1 million packets !!! */ t = (s32)(tcp_jiffies32 - ca->epoch_start); t += usecs_to_jiffies(ca->delay_min); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; do_div(t, HZ); if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; else offs = t - ca->bic_K; /* c/rtt * (t-K)^3 */ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); if (t < ca->bic_K) /* below origin*/ bic_target = ca->bic_origin_point - delta; else /* above origin*/ bic_target = ca->bic_origin_point + delta; /* cubic function - calc bictcp_cnt*/ if (bic_target > cwnd) { ca->cnt = cwnd / (bic_target - cwnd); } else { ca->cnt = 100 * cwnd; /* very small increment*/ } /* * The initial growth of cubic function may be too conservative * when the available bandwidth is still unknown. */ if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ tcp_friendliness: /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; delta = (cwnd * scale) >> 3; while (ca->ack_cnt > delta) { /* update tcp cwnd */ ca->ack_cnt -= delta; ca->tcp_cwnd++; } if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */ delta = ca->tcp_cwnd - cwnd; max_cnt = cwnd / delta; if (ca->cnt > max_cnt) ca->cnt = max_cnt; } } /* The maximum rate of cwnd increase CUBIC allows is 1 packet per * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. */ ca->cnt = max(ca->cnt, 2U); } __bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } bictcp_update(ca, tcp_snd_cwnd(tp), acked); tcp_cong_avoid_ai(tp, ca->cnt, acked); } __bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else ca->last_max_cwnd = tcp_snd_cwnd(tp); return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } __bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); bictcp_hystart_reset(sk); } } /* Account for TSO/GRO delays. * Otherwise short RTT flows could get too small ssthresh, since during * slow start we begin with small TSO packets and ca->delay_min would * not account for long aggregation delay when TSO packets get bigger. * Ideally even with a very small RTT we would like to have at least one * TSO packet being sent and received by GRO, and another one in qdisc layer. * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. */ static u32 hystart_ack_delay(const struct sock *sk) { unsigned long rate; rate = READ_ONCE(sk->sk_pacing_rate); if (!rate) return 0; return min_t(u64, USEC_PER_MSEC, div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate)); } static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 threshold; if (after(tp->snd_una, ca->end_seq)) bictcp_hystart_reset(sk); /* hystart triggers when cwnd is larger than some threshold */ if (tcp_snd_cwnd(tp) < hystart_low_window) return; if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock_us(sk); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; threshold = ca->delay_min + hystart_ack_delay(sk); /* Hystart ack train triggers if we get ack past * ca->delay_min/2. * Pacing might have delayed packets up to RTT/2 * during slow start. */ if (sk->sk_pacing_status == SK_PACING_NONE) threshold >>= 1; if ((s32)(now - ca->round_start) > threshold) { ca->found = 1; pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", now - ca->round_start, threshold, ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp)); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->curr_rtt > delay) ca->curr_rtt = delay; if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } } __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; /* Some calls are for duplicates without timetamps */ if (sample->rtt_us < 0) return; /* Discard delay samples right after fast recovery */ if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; delay = sample->rtt_us; if (delay == 0) delay = 1; /* first time call or link delay decreases */ if (ca->delay_min == 0 || ca->delay_min > delay) ca->delay_min = delay; if (!ca->found && tcp_in_slow_start(tp) && hystart) hystart_update(sk, delay); } static struct tcp_congestion_ops cubictcp __read_mostly = { .init = cubictcp_init, .ssthresh = cubictcp_recalc_ssthresh, .cong_avoid = cubictcp_cong_avoid, .set_state = cubictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = cubictcp_cwnd_event, .pkts_acked = cubictcp_acked, .owner = THIS_MODULE, .name = "cubic", }; BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { .owner = THIS_MODULE, .set = &tcp_cubic_check_kfunc_ids, }; static int __init cubictcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); /* Precompute a bunch of the scaling factors that are used per-packet * based on SRTT of 100ms */ beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3 / (BICTCP_BETA_SCALE - beta); cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 * so K = cubic_root( (wmax-cwnd)*rtt/c ) * the unit of K is bictcp_HZ=2^10, not HZ * * c = bic_scale >> 10 * rtt = 100ms * * the following code has been designed and tested for * cwnd < 1 million packets * RTT < 100 seconds * HZ < 1,000,00 (corresponding to 10 nano-second) */ /* 1/c * 2^2*bictcp_HZ * srtt */ cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set); if (ret < 0) return ret; return tcp_register_congestion_control(&cubictcp); } static void __exit cubictcp_unregister(void) { tcp_unregister_congestion_control(&cubictcp); } module_init(cubictcp_register); module_exit(cubictcp_unregister); MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CUBIC TCP"); MODULE_VERSION("2.3");
1585 1585 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H #include <linux/types.h> #ifdef __KERNEL__ #include <asm/page_types.h> #ifdef CONFIG_X86_64 #include <asm/page_64.h> #else #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLER__ struct page; #include <linux/range.h> extern struct range pfn_mapped[]; extern int nr_pfn_mapped; static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); } static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { copy_page(to, from); } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) #endif #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ /* * We need __phys_reloc_hide() here because gcc may assume that there is no * overflow during __pa() calculation and can optimize it unexpectedly. * Newer versions of gcc provide -fno-strict-overflow switch to handle this * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) /* * virt_to_page(kaddr) returns a valid pointer if and only if * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) static __always_inline void *pfn_to_kaddr(unsigned long pfn) { return __va(pfn << PAGE_SHIFT); } static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) { return __canonical_address(vaddr, vaddr_bits) == vaddr; } #endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */
150 8 147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SHA-1 optimized for x86_64 * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <linux/static_call.h> DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic); #define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \ asmlinkage void asm_fn(struct sha1_block_state *state, \ const u8 *data, size_t nblocks); \ static void c_fn(struct sha1_block_state *state, \ const u8 *data, size_t nblocks) \ { \ if (likely(irq_fpu_usable())) { \ kernel_fpu_begin(); \ asm_fn(state, data, nblocks); \ kernel_fpu_end(); \ } else { \ sha1_blocks_generic(state, data, nblocks); \ } \ } DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3); DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx); DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform); #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ asmlinkage void sha1_transform_avx2(struct sha1_block_state *state, const u8 *data, size_t nblocks); static void sha1_blocks_avx2(struct sha1_block_state *state, const u8 *data, size_t nblocks) { if (likely(irq_fpu_usable())) { kernel_fpu_begin(); /* Select the optimal transform based on the number of blocks */ if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE) sha1_transform_avx2(state, data, nblocks); else sha1_transform_avx(state, data, nblocks); kernel_fpu_end(); } else { sha1_blocks_generic(state, data, nblocks); } } static void sha1_blocks(struct sha1_block_state *state, const u8 *data, size_t nblocks) { static_call(sha1_blocks_x86)(state, data, nblocks); } #define sha1_mod_init_arch sha1_mod_init_arch static void sha1_mod_init_arch(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha1_blocks_x86, sha1_blocks_ni); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha1_blocks_x86, sha1_blocks_avx2); else static_call_update(sha1_blocks_x86, sha1_blocks_avx); } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { static_call_update(sha1_blocks_x86, sha1_blocks_ssse3); } }
10 9 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 9 10 10 9 10 10 9 23 23 23 21 11 22 7 23 23 9 9 10 10 10 10 10 9 9 10 10 10 10 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.c * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Hugh Dickins <hughd@google.com> * Zheng Liu <wenqing.lz@taobao.com> * * Ext4 extents status tree core functions. */ #include <linux/list_sort.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "ext4.h" #include <trace/events/ext4.h> /* * According to previous discussion in Ext4 Developer Workshop, we * will introduce a new structure called io tree to track all extent * status in order to solve some problems that we have met * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called * delay extent tree at the first commit. But for better understand * what it does, it has been rename to extent status tree. * * Step1: * Currently the first step has been done. All delayed extents are * tracked in the tree. It maintains the delayed extent when a delayed * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implemenmtation of extent * status tree and future works. * * Step2: * In this step all extent status are tracked by extent status tree. * Thus, we can first try to lookup a block mapping in this tree before * finding it in extent tree. Hence, single extent cache can be removed * because extent status tree can do a better job. Extents in status * tree are loaded on-demand. Therefore, the extent status tree may not * contain all of the extents in a file. Meanwhile we define a shrinker * to reclaim memory from extent status tree because fragmented extent * tree will make status tree cost too much memory. written/unwritten/- * hole extents in the tree will be reclaimed by this shrinker when we * are under high memory pressure. Delayed extents will not be * reclimed because fiemap, bigalloc, and seek_data/hole need it. */ /* * Extent status tree implementation for ext4. * * * ========================================================================== * Extent status tree tracks all extent status. * * 1. Why we need to implement extent status tree? * * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a * block or a range of blocks are belonged to a delayed extent. * * Let us have a look at how they do without extent status tree. * -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * * -- SEEK_HOLE/DATA * SEEK_HOLE/DATA has the same problem as FIEMAP. * * -- bigalloc * bigalloc looks up page cache to figure out if a block is * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped, If there are not very many delayed buffers, then it is * time consuming. * * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation(belonged to a delayed extent) or * not by searching the extent tree. * * * ========================================================================== * 2. Ext4 extent status tree impelmentation * * -- extent * A extent is a range of blocks which are contiguous logically and * physically. Unlike extent in extent tree, this extent in ext4 is * a in-memory struct, there is no corresponding on-disk data. There * is no limit on length of extent, so an extent can contain as many * blocks as they are contiguous logically and physically. * * -- extent status tree * Every inode has an extent status tree and all allocation blocks * are added to the tree with different status. The extent in the * tree are ordered by logical block no. * * -- operations on a extent status tree * There are three important operations on a delayed extent tree: find * next extent, adding a extent(a range of blocks) and removing a extent. * * -- race on a extent status tree * Extent status tree is protected by inode->i_es_lock. * * -- memory consumption * Fragmented extent tree will make extent status tree cost too much * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * * ========================================================================== * 3. Assurance of Ext4 extent status tree consistency * * When mapping blocks, Ext4 queries the extent status tree first and should * always trusts that the extent status tree is consistent and up to date. * Therefore, it is important to adheres to the following rules when createing, * modifying and removing extents. * * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, * the extent information should always be processed through the extent * status tree instead of being organized manually through the on-disk * extent tree. * * 2. When updating the extent tree, Ext4 should acquire the i_data_sem * exclusively and update the extent status tree atomically. If the extents * to be modified are large enough to exceed the range that a single * i_data_sem can process (as ext4_datasem_ensure_credits() may drop * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() * does): * * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against page faults, as well as reads and writes that may * concurrently modify the extent status tree. * b) Evict all page cache in the affected range and recommend rebuilding * or dropping the extent status tree after modifying the on-disk * extent tree. This ensures exclusion against concurrent writebacks * that do not hold those locks but only holds a folio lock. * * 3. Based on the rules above, when querying block mappings, Ext4 should at * least hold the i_rwsem or invalidate_lock or folio lock(s) for the * specified querying range. * * ========================================================================== * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. * * -- gain * 2. Code is much simpler, more readable, more maintainable and * more efficient. * * * ========================================================================== * 5. TODO list * * -- Refactor delayed space reservation * * -- Extent-level locking */ static struct kmem_cache *ext4_es_cachep; static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, int *reserved, struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, struct pending_reservation **prealloc); int __init ext4_init_es(void) { ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_es(void) { kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) { tree->root = RB_ROOT; tree->cache_es = NULL; } #ifdef ES_DEBUG__ static void ext4_es_print_tree(struct inode *inode) { struct ext4_es_tree *tree; struct rb_node *node; printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); tree = &EXT4_I(inode)->i_es_tree; node = rb_first(&tree->root); while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); } printk(KERN_DEBUG "\n"); } #else #define ext4_es_print_tree(inode) #endif static inline ext4_lblk_t ext4_es_end(struct extent_status *es) { BUG_ON(es->es_lblk + es->es_len < es->es_lblk); return es->es_lblk + es->es_len - 1; } /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. */ static struct extent_status *__es_tree_search(struct rb_root *root, ext4_lblk_t lblk) { struct rb_node *node = root->rb_node; struct extent_status *es = NULL; while (node) { es = rb_entry(node, struct extent_status, rb_node); if (lblk < es->es_lblk) node = node->rb_left; else if (lblk > ext4_es_end(es)) node = node->rb_right; else return es; } if (es && lblk < es->es_lblk) return es; if (es && lblk > ext4_es_end(es)) { node = rb_next(&es->rb_node); return node ? rb_entry(node, struct extent_status, rb_node) : NULL; } return NULL; } /* * ext4_es_find_extent_range - find extent with specified status within block * range or next extent following block range in * extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * @es - extent found, if any * * Find the first extent within the block range specified by @lblk and @end * in the extents status tree that satisfies @matching_fn. If a match * is found, it's returned in @es. If not, and a matching extent is found * beyond the block range, it's returned in @es. If no match is found, an * extent is returned in @es whose es_lblk, es_len, and es_pblk components * are 0. */ static void __es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; WARN_ON(es == NULL); WARN_ON(end < lblk); tree = &EXT4_I(inode)->i_es_tree; /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u) %llu %x\n", lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } es1 = __es_tree_search(&tree->root, lblk); out: if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } if (matching_fn(es1)) break; } } if (es1 && matching_fn(es1)) { WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } } /* * Locking for __es_find_extent_range() for external use */ void ext4_es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { es->es_lblk = es->es_len = es->es_pblk = 0; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; trace_ext4_es_find_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); __es_find_extent_range(inode, matching_fn, lblk, end, es); read_unlock(&EXT4_I(inode)->i_es_lock); trace_ext4_es_find_extent_range_exit(inode, es); } /* * __es_scan_range - search block range for block with specified status * in extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * * Returns true if at least one block in the specified block range satisfies * the criterion specified by @matching_fn, and false if not. If at least * one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. */ static bool __es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t start, ext4_lblk_t end) { struct extent_status es; __es_find_extent_range(inode, matching_fn, start, end, &es); if (es.es_len == 0) return false; /* no matching extent in the tree */ else if (es.es_lblk <= start && start < es.es_lblk + es.es_len) return true; else if (start <= es.es_lblk && es.es_lblk <= end) return true; else return false; } /* * Locking for __es_scan_range() for external use */ bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); return