| 6 1 8 8 9 8 1 9 9 52 1 1 48 2 17 11 8 1 6 79 7 1 6 106 3 1 15 85 1 6 2 16 14 55 61 9 6 9 2 11 9 6 73 3 81 81 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/filter.h> #include <linux/bpf.h> #include <net/netlink.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_bpf.h> #include <net/tc_act/tc_bpf.h> #include <net/tc_wrapper.h> #define ACT_BPF_NAME_LEN 256 struct tcf_bpf_cfg { struct bpf_prog *filter; struct sock_filter *bpf_ops; const char *bpf_name; u16 bpf_num_ops; bool is_ebpf; }; static struct tc_action_ops act_bpf_ops; TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; tcf_lastuse_update(&prog->tcf_tm); bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); filter_res = bpf_prog_run_data_pointers(filter, skb); __skb_pull(skb, skb->mac_len); } else { filter_res = bpf_prog_run_data_pointers(filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); /* A BPF program may overwrite the default action opcode. * Similarly as in cls_bpf, if filter_res == -1 we use the * default action specified from tc. * * In case a different well-known TC_ACT opcode has been * returned, it will overwrite the default one. * * For everything else that is unknown, TC_ACT_UNSPEC is * returned. */ switch (filter_res) { case TC_ACT_PIPE: case TC_ACT_RECLASSIFY: case TC_ACT_OK: case TC_ACT_REDIRECT: action = filter_res; break; case TC_ACT_SHOT: action = filter_res; qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats)); break; case TC_ACT_UNSPEC: action = prog->tcf_action; break; default: action = TC_ACT_UNSPEC; break; } return action; } static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog) { return !prog->bpf_ops; } static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { struct nlattr *nla; if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops * sizeof(struct sock_filter)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); return 0; } static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { struct nlattr *nla; if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; if (nla_put_u32(skb, TCA_ACT_BPF_ID, prog->filter->aux->id)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) { unsigned char *tp = skb_tail_pointer(skb); struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, .refcnt = refcount_read(&prog->tcf_refcnt) - ref, .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, }; struct tcf_t tm; int ret; spin_lock_bh(&prog->tcf_lock); opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (tcf_bpf_is_ebpf(prog)) ret = tcf_bpf_dump_ebpf_info(prog, skb); else ret = tcf_bpf_dump_bpf_info(prog, skb); if (ret) goto nla_put_failure; tcf_tm_dump(&tm, &prog->tcf_tm); if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, TCA_ACT_BPF_PAD)) goto nla_put_failure; spin_unlock_bh(&prog->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = { [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) }, [TCA_ACT_BPF_FD] = { .type = NLA_U32 }, [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN }, [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; int ret; bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS])) return -EINVAL; bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; ret = bpf_prog_create(&fp, &fprog_tmp); if (ret < 0) { kfree(bpf_ops); return ret; } cfg->bpf_ops = bpf_ops; cfg->bpf_num_ops = bpf_num_ops; cfg->filter = fp; cfg->is_ebpf = false; return 0; } static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) { struct bpf_prog *fp; char *name = NULL; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_ACT); if (IS_ERR(fp)) return PTR_ERR(fp); if (tb[TCA_ACT_BPF_NAME]) { name = nla_memdup(tb[TCA_ACT_BPF_NAME], GFP_KERNEL); if (!name) { bpf_prog_put(fp); return -ENOMEM; } } cfg->bpf_name = name; cfg->filter = fp; cfg->is_ebpf = true; return 0; } static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg) { struct bpf_prog *filter = cfg->filter; if (filter) { if (cfg->is_ebpf) bpf_prog_put(filter); else bpf_prog_destroy(filter); } kfree(cfg->bpf_ops); kfree(cfg->bpf_name); } static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, struct tcf_bpf_cfg *cfg) { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); /* updates to prog->filter are prevented, since it's called either * with tcf lock or during final cleanup in rcu callback */ cfg->filter = rcu_dereference_protected(prog->filter, 1); cfg->bpf_ops = prog->bpf_ops; cfg->bpf_name = prog->bpf_name; } static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_bpf_cfg cfg, old; struct tc_act_bpf *parm; struct tcf_bpf *prog; bool is_bpf, is_ebpf; int ret, res = 0; u32 index; if (!nla) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_ACT_BPF_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_ACT_BPF_PARMS]); index = parm->index; ret = tcf_idr_check_alloc(tn, &index, act, bind); if (!ret) { ret = tcf_idr_create(tn, index, est, act, &act_bpf_ops, bind, true, flags); if (ret < 0) { tcf_idr_cleanup(tn, index); return ret; } res = ACT_P_CREATED; } else if (ret > 0) { /* Don't override defaults. */ if (bind) return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*act, bind); return -EEXIST; } } else { return ret; } ret = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (ret < 0) goto release_idr; is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; is_ebpf = tb[TCA_ACT_BPF_FD]; if (is_bpf == is_ebpf) { ret = -EINVAL; goto put_chain; } memset(&cfg, 0, sizeof(cfg)); ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : tcf_bpf_init_from_efd(tb, &cfg); if (ret < 0) goto put_chain; prog = to_bpf(*act); spin_lock_bh(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); prog->bpf_ops = cfg.bpf_ops; prog->bpf_name = cfg.bpf_name; if (cfg.bpf_num_ops) prog->bpf_num_ops = cfg.bpf_num_ops; goto_ch = tcf_action_set_ctrlact(*act, parm->action, goto_ch); rcu_assign_pointer(prog->filter, cfg.filter); spin_unlock_bh(&prog->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (res != ACT_P_CREATED) { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); } return res; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*act, bind); return ret; } static void tcf_bpf_cleanup(struct tc_action *act) { struct tcf_bpf_cfg tmp; tcf_bpf_prog_fill_cfg(to_bpf(act), &tmp); tcf_bpf_cfg_cleanup(&tmp); } static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .id = TCA_ID_BPF, .owner = THIS_MODULE, .act = tcf_bpf_act, .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, .size = sizeof(struct tcf_bpf), }; MODULE_ALIAS_NET_ACT("bpf"); static __net_init int bpf_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id); return tc_action_net_init(net, tn, &act_bpf_ops); } static void __net_exit bpf_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_bpf_ops.net_id); } static struct pernet_operations bpf_net_ops = { .init = bpf_init_net, .exit_batch = bpf_exit_net, .id = &act_bpf_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init bpf_init_module(void) { return tcf_register_action(&act_bpf_ops, &bpf_net_ops); } static void __exit bpf_cleanup_module(void) { tcf_unregister_action(&act_bpf_ops, &bpf_net_ops); } module_init(bpf_init_module); module_exit(bpf_cleanup_module); MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); MODULE_DESCRIPTION("TC BPF based action"); MODULE_LICENSE("GPL v2"); |
| 89 86 87 12 70 7 1 1 1 1 8 5 5 4 77 77 77 5 66 5 1 1 81 7 3 3 93 93 82 89 14 3 1 76 66 7 1 1 4 12 2 75 10 1 7 5 4 4 2 66 15 14 66 66 1 66 1 6 6 11 72 72 8 74 73 3 1 2 16 16 14 3 3 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 | // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for SCTP. * * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. */ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif /* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map, const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { nf_ct_l4proto_log_invalid(skb, ct, state, "%s failed. chunk num %d, type %d, len %d flag %d\n", __func__, count, sch->type, sch->length, flag); return 1; } if (map) set_bit(sch->type, map); } return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; switch (chunk_type) { case SCTP_CID_INIT: i = 0; break; case SCTP_CID_INIT_ACK: i = 1; break; case SCTP_CID_ABORT: i = 2; break; case SCTP_CID_SHUTDOWN: i = 3; break; case SCTP_CID_SHUTDOWN_ACK: i = 4; break; case SCTP_CID_ERROR: i = 5; break; case SCTP_CID_COOKIE_ECHO: i = 6; break; case SCTP_CID_COOKIE_ACK: i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: i = 8; break; case SCTP_CID_HEARTBEAT: i = 9; break; case SCTP_CID_HEARTBEAT_ACK: i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type %d, Will stay in %s\n", chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { nf_ct_l4proto_log_invalid(skb, ct, state, "verification tag check failed %x vs %x for dir %d", sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { nf_ct_l4proto_log_invalid(skb, ct, state, "Invalid, old_state %d, dir %d, type %d", old_state, dir, sch->type); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. */ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
| 12 12 1 9 3 8 6 2 34 24 27 32 19 13 32 5 15 6 6 6 6 3 6 6 3 2 2 1 4 3 3 2 16 17 4 16 14 14 1 3 6 4 13 1 2 2 1 4 7 2 5 5 5 3 2 5 4 3 5 4 13 1 12 11 1 10 10 10 1 9 9 7 2 7 9 33 33 29 7 4 1 2 1 11 11 5 3 2 100 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 | // SPDX-License-Identifier: GPL-2.0 /* * Provide access to virtual console memory. * /dev/vcs: the screen as it is being viewed right now (possibly scrolled) * /dev/vcsN: the screen of /dev/ttyN (1 <= N <= 63) * [minor: N] * * /dev/vcsaN: idem, but including attributes, and prefixed with * the 4 bytes lines,columns,x,y (as screendump used to give). * Attribute/character pair is in native endianity. * [minor: N+128] * * /dev/vcsuN: similar to /dev/vcsaN but using 4-byte unicode values * instead of 1-byte screen glyph values. * [minor: N+64] * * /dev/vcsuaN: same idea as /dev/vcsaN for unicode (not yet implemented). * * This replaces screendump and part of selection, so that the system * administrator can control access using file system permissions. * * aeb@cwi.nl - efter Friedas begravelse - 950211 * * machek@k332.feld.cvut.cz - modified not to send characters to wrong console * - fixed some fatal off-by-one bugs (0-- no longer == -1 -> looping and looping and looping...) * - making it shorter - scr_readw are macros which expand in PRETTY long code */ #include <linux/kernel.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/tty.h> #include <linux/interrupt.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/kbd_kern.h> #include <linux/console.h> #include <linux/device.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/signal.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/uaccess.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #define HEADER_SIZE 4u #define CON_BUF_SIZE (IS_ENABLED(CONFIG_BASE_SMALL) ? 256 : PAGE_SIZE) DEFINE_FREE(free_page_ptr, void *, if (_T) free_page((unsigned long)_T)); /* * Our minor space: * * 0 ... 63 glyph mode without attributes * 64 ... 127 unicode mode without attributes * 128 ... 191 glyph mode with attributes * 192 ... 255 unused (reserved for unicode with attributes) * * This relies on MAX_NR_CONSOLES being <= 63, meaning 63 actual consoles * with minors 0, 64, 128 and 192 being proxies for the foreground console. */ #if MAX_NR_CONSOLES > 63 #warning "/dev/vcs* devices may not accommodate more than 63 consoles" #endif #define console(inode) (iminor(inode) & 63) #define use_unicode(inode) (iminor(inode) & 64) #define use_attributes(inode) (iminor(inode) & 128) struct vcs_poll_data { struct notifier_block notifier; unsigned int cons_num; int event; wait_queue_head_t waitq; struct fasync_struct *fasync; }; static int vcs_notifier(struct notifier_block *nb, unsigned long code, void *_param) { struct vt_notifier_param *param = _param; struct vc_data *vc = param->vc; struct vcs_poll_data *poll = container_of(nb, struct vcs_poll_data, notifier); int currcons = poll->cons_num; int fa_band; switch (code) { case VT_UPDATE: fa_band = POLL_PRI; break; case VT_DEALLOCATE: fa_band = POLL_HUP; break; default: return NOTIFY_DONE; } if (currcons == 0) currcons = fg_console; else currcons--; if (currcons != vc->vc_num) return NOTIFY_DONE; poll->event = code; wake_up_interruptible(&poll->waitq); kill_fasync(&poll->fasync, SIGIO, fa_band); return NOTIFY_OK; } static void vcs_poll_data_free(struct vcs_poll_data *poll) { unregister_vt_notifier(&poll->notifier); kfree(poll); } static struct vcs_poll_data * vcs_poll_data_get(struct file *file) { struct vcs_poll_data *poll = file->private_data, *kill = NULL; if (poll) return poll; poll = kzalloc(sizeof(*poll), GFP_KERNEL); if (!poll) return NULL; poll->cons_num = console(file_inode(file)); init_waitqueue_head(&poll->waitq); poll->notifier.notifier_call = vcs_notifier; /* * In order not to lose any update event, we must pretend one might * have occurred before we have a chance to register our notifier. * This is also how user space has come to detect which kernels * support POLLPRI on /dev/vcs* devices i.e. using poll() with * POLLPRI and a zero timeout. */ poll->event = VT_UPDATE; if (register_vt_notifier(&poll->notifier) != 0) { kfree(poll); return NULL; } /* * This code may be called either through ->poll() or ->fasync(). * If we have two threads using the same file descriptor, they could * both enter this function, both notice that the structure hasn't * been allocated yet and go ahead allocating it in parallel, but * only one of them must survive and be shared otherwise we'd leak * memory with a dangling notifier callback. */ spin_lock(&file->f_lock); if (!file->private_data) { file->private_data = poll; } else { /* someone else raced ahead of us */ kill = poll; poll = file->private_data; } spin_unlock(&file->f_lock); if (kill) vcs_poll_data_free(kill); return poll; } /** * vcs_vc - return VC for @inode * @inode: inode for which to return a VC * @viewed: returns whether this console is currently foreground (viewed) * * Must be called with console_lock. */ static struct vc_data *vcs_vc(struct inode *inode, bool *viewed) { unsigned int currcons = console(inode); WARN_CONSOLE_UNLOCKED(); if (currcons == 0) { currcons = fg_console; if (viewed) *viewed = true; } else { currcons--; if (viewed) *viewed = false; } return vc_cons[currcons].d; } /** * vcs_size - return size for a VC in @vc * @vc: which VC * @attr: does it use attributes? * @unicode: is it unicode? * * Must be called with console_lock. */ static int vcs_size(const struct vc_data *vc, bool attr, bool unicode) { int size; WARN_CONSOLE_UNLOCKED(); size = vc->vc_rows * vc->vc_cols; if (attr) { if (unicode) return -EOPNOTSUPP; size = 2 * size + HEADER_SIZE; } else if (unicode) size *= 4; return size; } static loff_t vcs_lseek(struct file *file, loff_t offset, int orig) { struct inode *inode = file_inode(file); struct vc_data *vc; int size; scoped_guard(console_lock) { vc = vcs_vc(inode, NULL); if (!vc) return -ENXIO; size = vcs_size(vc, use_attributes(inode), use_unicode(inode)); } if (size < 0) return size; return fixed_size_llseek(file, offset, orig, size); } static int vcs_read_buf_uni(struct vc_data *vc, char *con_buf, unsigned int pos, unsigned int count, bool viewed) { unsigned int nr, row, col, maxcol = vc->vc_cols; int ret; ret = vc_uniscr_check(vc); if (ret) return ret; pos /= 4; row = pos / maxcol; col = pos % maxcol; nr = maxcol - col; do { if (nr > count / 4) nr = count / 4; vc_uniscr_copy_line(vc, con_buf, viewed, row, col, nr); con_buf += nr * 4; count -= nr * 4; row++; col = 0; nr = maxcol; } while (count); return 0; } static void vcs_read_buf_noattr(const struct vc_data *vc, char *con_buf, unsigned int pos, unsigned int count, bool viewed) { u16 *org; unsigned int col, maxcol = vc->vc_cols; org = screen_pos(vc, pos, viewed); col = pos % maxcol; pos += maxcol - col; while (count-- > 0) { *con_buf++ = (vcs_scr_readw(vc, org++) & 0xff); if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } } static unsigned int vcs_read_buf(const struct vc_data *vc, char *con_buf, unsigned int pos, unsigned int count, bool viewed, unsigned int *skip) { u16 *org, *con_buf16; unsigned int col, maxcol = vc->vc_cols; unsigned int filled = count; if (pos < HEADER_SIZE) { /* clamp header values if they don't fit */ con_buf[0] = min(vc->vc_rows, 0xFFu); con_buf[1] = min(vc->vc_cols, 0xFFu); getconsxy(vc, con_buf + 2); *skip += pos; count += pos; if (count > CON_BUF_SIZE) { count = CON_BUF_SIZE; filled = count - pos; } /* Advance state pointers and move on. */ count -= min(HEADER_SIZE, count); pos = HEADER_SIZE; con_buf += HEADER_SIZE; /* If count >= 0, then pos is even... */ } else if (pos & 1) { /* * Skip first byte for output if start address is odd. Update * region sizes up/down depending on free space in buffer. */ (*skip)++; if (count < CON_BUF_SIZE) count++; else filled--; } if (!count) return filled; pos -= HEADER_SIZE; pos /= 2; col = pos % maxcol; org = screen_pos(vc, pos, viewed); pos += maxcol - col; /* * Buffer has even length, so we can always copy character + attribute. * We do not copy last byte to userspace if count is odd. */ count = (count + 1) / 2; con_buf16 = (u16 *)con_buf; while (count) { *con_buf16++ = vcs_scr_readw(vc, org++); count--; if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } return filled; } static ssize_t vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct vc_data *vc; struct vcs_poll_data *poll; unsigned int read; ssize_t ret; loff_t pos; bool viewed, attr, uni_mode; char *con_buf __free(free_page_ptr) = (char *)__get_free_page(GFP_KERNEL); if (!con_buf) return -ENOMEM; pos = *ppos; /* Select the proper current console and verify * sanity of the situation under the console lock. */ guard(console_lock)(); uni_mode = use_unicode(inode); attr = use_attributes(inode); if (pos < 0) return -EINVAL; /* we enforce 32-bit alignment for pos and count in unicode mode */ if (uni_mode && (pos | count) & 3) return -EINVAL; poll = file->private_data; if (count && poll) poll->event = 0; read = 0; ret = 0; while (count) { unsigned int this_round, skip = 0; int size; vc = vcs_vc(inode, &viewed); if (!vc) { ret = -ENXIO; break; } /* Check whether we are above size each round, * as copy_to_user at the end of this loop * could sleep. */ size = vcs_size(vc, attr, uni_mode); if (size < 0) { ret = size; break; } if (pos >= size) break; if (count > size - pos) count = size - pos; this_round = count; if (this_round > CON_BUF_SIZE) this_round = CON_BUF_SIZE; /* Perform the whole read into the local con_buf. * Then we can drop the console spinlock and safely * attempt to move it to userspace. */ if (uni_mode) { ret = vcs_read_buf_uni(vc, con_buf, pos, this_round, viewed); if (ret) break; } else if (!attr) { vcs_read_buf_noattr(vc, con_buf, pos, this_round, viewed); } else { this_round = vcs_read_buf(vc, con_buf, pos, this_round, viewed, &skip); } /* Finally, release the console semaphore while we push * all the data to userspace from our temporary buffer. * * AKPM: Even though it's a semaphore, we should drop it because * the pagefault handling code may want to call printk(). */ console_unlock(); ret = copy_to_user(buf, con_buf + skip, this_round); console_lock(); if (ret) { read += this_round - ret; ret = -EFAULT; break; } buf += this_round; pos += this_round; read += this_round; count -= this_round; } *ppos += read; if (read) return read; return ret; } static u16 *vcs_write_buf_noattr(struct vc_data *vc, const char *con_buf, unsigned int pos, unsigned int count, bool viewed, u16 **org0) { u16 *org; unsigned int col, maxcol = vc->vc_cols; *org0 = org = screen_pos(vc, pos, viewed); col = pos % maxcol; pos += maxcol - col; while (count > 0) { unsigned char c = *con_buf++; count--; vcs_scr_writew(vc, (vcs_scr_readw(vc, org) & 0xff00) | c, org); org++; if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } return org; } /* * Compilers (gcc 10) are unable to optimize the swap in cpu_to_le16. So do it * the poor man way. */ static inline u16 vc_compile_le16(u8 hi, u8 lo) { #ifdef __BIG_ENDIAN return (lo << 8u) | hi; #else return (hi << 8u) | lo; #endif } static u16 *vcs_write_buf(struct vc_data *vc, const char *con_buf, unsigned int pos, unsigned int count, bool viewed, u16 **org0) { u16 *org; unsigned int col, maxcol = vc->vc_cols; unsigned char c; /* header */ if (pos < HEADER_SIZE) { char header[HEADER_SIZE]; getconsxy(vc, header + 2); while (pos < HEADER_SIZE && count > 0) { count--; header[pos++] = *con_buf++; } if (!viewed) putconsxy(vc, header + 2); } if (!count) return NULL; pos -= HEADER_SIZE; col = (pos/2) % maxcol; *org0 = org = screen_pos(vc, pos/2, viewed); /* odd pos -- the first single character */ if (pos & 1) { count--; c = *con_buf++; vcs_scr_writew(vc, vc_compile_le16(c, vcs_scr_readw(vc, org)), org); org++; pos++; if (++col == maxcol) { org = screen_pos(vc, pos/2, viewed); col = 0; } } pos /= 2; pos += maxcol - col; /* even pos -- handle attr+character pairs */ while (count > 1) { unsigned short w; w = get_unaligned(((unsigned short *)con_buf)); vcs_scr_writew(vc, w, org++); con_buf += 2; count -= 2; if (++col == maxcol) { org = screen_pos(vc, pos, viewed); col = 0; pos += maxcol; } } if (!count) return org; /* odd pos -- the remaining character */ c = *con_buf++; vcs_scr_writew(vc, vc_compile_le16(vcs_scr_readw(vc, org) >> 8, c), org); return org; } static ssize_t vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct vc_data *vc; u16 *org0, *org; unsigned int written; int size; ssize_t ret; loff_t pos; bool viewed, attr; if (use_unicode(inode)) return -EOPNOTSUPP; char *con_buf __free(free_page_ptr) = (char *)__get_free_page(GFP_KERNEL); if (!con_buf) return -ENOMEM; pos = *ppos; /* Select the proper current console and verify * sanity of the situation under the console lock. */ guard(console_lock)(); attr = use_attributes(inode); vc = vcs_vc(inode, &viewed); if (!vc) return -ENXIO; size = vcs_size(vc, attr, false); if (size < 0) return size; if (pos < 0 || pos > size) return -EINVAL; if (count > size - pos) count = size - pos; written = 0; while (count) { unsigned int this_round = count; if (this_round > CON_BUF_SIZE) this_round = CON_BUF_SIZE; /* Temporarily drop the console lock so that we can read * in the write data from userspace safely. */ console_unlock(); ret = copy_from_user(con_buf, buf, this_round); console_lock(); if (ret) { this_round -= ret; if (!this_round) { /* Abort loop if no data were copied. Otherwise * fail with -EFAULT. */ if (written) break; return -EFAULT; } } /* The vc might have been freed or vcs_size might have changed * while we slept to grab the user buffer, so recheck. * Return data written up to now on failure. */ vc = vcs_vc(inode, &viewed); if (!vc) { if (written) break; return -ENXIO; } size = vcs_size(vc, attr, false); if (size < 0) { if (written) break; return size; } if (pos >= size) break; if (this_round > size - pos) this_round = size - pos; /* OK, now actually push the write to the console * under the lock using the local kernel buffer. */ if (attr) org = vcs_write_buf(vc, con_buf, pos, this_round, viewed, &org0); else org = vcs_write_buf_noattr(vc, con_buf, pos, this_round, viewed, &org0); count -= this_round; written += this_round; buf += this_round; pos += this_round; if (org) update_region(vc, (unsigned long)(org0), org - org0); } *ppos += written; ret = written; if (written) vcs_scr_updated(vc); return ret; } static __poll_t vcs_poll(struct file *file, poll_table *wait) { struct vcs_poll_data *poll = vcs_poll_data_get(file); __poll_t ret = DEFAULT_POLLMASK|EPOLLERR; if (poll) { poll_wait(file, &poll->waitq, wait); switch (poll->event) { case VT_UPDATE: ret = DEFAULT_POLLMASK|EPOLLPRI; break; case VT_DEALLOCATE: ret = DEFAULT_POLLMASK|EPOLLHUP|EPOLLERR; break; case 0: ret = DEFAULT_POLLMASK; break; } } return ret; } static int vcs_fasync(int fd, struct file *file, int on) { struct vcs_poll_data *poll = file->private_data; if (!poll) { /* don't allocate anything if all we want is disable fasync */ if (!on) return 0; poll = vcs_poll_data_get(file); if (!poll) return -ENOMEM; } return fasync_helper(fd, file, on, &poll->fasync); } static int vcs_open(struct inode *inode, struct file *filp) { unsigned int currcons = console(inode); bool attr = use_attributes(inode); bool uni_mode = use_unicode(inode); /* we currently don't support attributes in unicode mode */ if (attr && uni_mode) return -EOPNOTSUPP; guard(console_lock)(); if (currcons && !vc_cons_allocated(currcons - 1)) return -ENXIO; return 0; } static int vcs_release(struct inode *inode, struct file *file) { struct vcs_poll_data *poll = file->private_data; if (poll) vcs_poll_data_free(poll); return 0; } static const struct file_operations vcs_fops = { .llseek = vcs_lseek, .read = vcs_read, .write = vcs_write, .poll = vcs_poll, .fasync = vcs_fasync, .open = vcs_open, .release = vcs_release, }; static const struct class vc_class = { .name = "vc", }; void vcs_make_sysfs(int index) { device_create(&vc_class, NULL, MKDEV(VCS_MAJOR, index + 1), NULL, "vcs%u", index + 1); device_create(&vc_class, NULL, MKDEV(VCS_MAJOR, index + 65), NULL, "vcsu%u", index + 1); device_create(&vc_class, NULL, MKDEV(VCS_MAJOR, index + 129), NULL, "vcsa%u", index + 1); } void vcs_remove_sysfs(int index) { device_destroy(&vc_class, MKDEV(VCS_MAJOR, index + 1)); device_destroy(&vc_class, MKDEV(VCS_MAJOR, index + 65)); device_destroy(&vc_class, MKDEV(VCS_MAJOR, index + 129)); } int __init vcs_init(void) { unsigned int i; if (register_chrdev(VCS_MAJOR, "vcs", &vcs_fops)) panic("unable to get major %d for vcs device", VCS_MAJOR); if (class_register(&vc_class)) panic("unable to create vc_class"); device_create(&vc_class, NULL, MKDEV(VCS_MAJOR, 0), NULL, "vcs"); device_create(&vc_class, NULL, MKDEV(VCS_MAJOR, 64), NULL, "vcsu"); device_create(&vc_class, NULL, MKDEV(VCS_MAJOR, 128), NULL, "vcsa"); for (i = 0; i < MIN_NR_CONSOLES; i++) vcs_make_sysfs(i); return 0; } |
| 2928 2932 2931 1 2931 2925 2929 2929 2928 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/static_call.h> #include <linux/bug.h> #include <linux/smp.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/cpu.h> #include <linux/processor.h> #include <asm/sections.h> extern struct static_call_site __start_static_call_sites[], __stop_static_call_sites[]; extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; int static_call_initialized; /* * Must be called before early_initcall() to be effective. */ void static_call_force_reinit(void) { if (WARN_ON_ONCE(!static_call_initialized)) return; static_call_initialized++; } /* mutex to protect key modules/sites */ static DEFINE_MUTEX(static_call_mutex); static void static_call_lock(void) { mutex_lock(&static_call_mutex); } static void static_call_unlock(void) { mutex_unlock(&static_call_mutex); } static inline void *static_call_addr(struct static_call_site *site) { return (void *)((long)site->addr + (long)&site->addr); } static inline unsigned long __static_call_key(const struct static_call_site *site) { return (long)site->key + (long)&site->key; } static inline struct static_call_key *static_call_key(const struct static_call_site *site) { return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); } /* These assume the key is word-aligned. */ static inline bool static_call_is_init(struct static_call_site *site) { return __static_call_key(site) & STATIC_CALL_SITE_INIT; } static inline bool static_call_is_tail(struct static_call_site *site) { return __static_call_key(site) & STATIC_CALL_SITE_TAIL; } static inline void static_call_set_init(struct static_call_site *site) { site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - (long)&site->key; } static int static_call_site_cmp(const void *_a, const void *_b) { const struct static_call_site *a = _a; const struct static_call_site *b = _b; const struct static_call_key *key_a = static_call_key(a); const struct static_call_key *key_b = static_call_key(b); if (key_a < key_b) return -1; if (key_a > key_b) return 1; return 0; } static void static_call_site_swap(void *_a, void *_b, int size) { long delta = (unsigned long)_a - (unsigned long)_b; struct static_call_site *a = _a; struct static_call_site *b = _b; struct static_call_site tmp = *a; a->addr = b->addr - delta; a->key = b->key - delta; b->addr = tmp.addr + delta; b->key = tmp.key + delta; } static inline void static_call_sort_entries(struct static_call_site *start, struct static_call_site *stop) { sort(start, stop - start, sizeof(struct static_call_site), static_call_site_cmp, static_call_site_swap); } static inline bool static_call_key_has_mods(struct static_call_key *key) { return !(key->type & 1); } static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) { if (!static_call_key_has_mods(key)) return NULL; return key->mods; } static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) { if (static_call_key_has_mods(key)) return NULL; return (struct static_call_site *)(key->type & ~1); } void __static_call_update(struct static_call_key *key, void *tramp, void *func) { struct static_call_site *site, *stop; struct static_call_mod *site_mod, first; cpus_read_lock(); static_call_lock(); if (key->func == func) goto done; key->func = func; arch_static_call_transform(NULL, tramp, func, false); /* * If uninitialized, we'll not update the callsites, but they still * point to the trampoline and we just patched that. */ if (WARN_ON_ONCE(!static_call_initialized)) goto done; first = (struct static_call_mod){ .next = static_call_key_next(key), .mod = NULL, .sites = static_call_key_sites(key), }; for (site_mod = &first; site_mod; site_mod = site_mod->next) { bool init = system_state < SYSTEM_RUNNING; struct module *mod = site_mod->mod; if (!site_mod->sites) { /* * This can happen if the static call key is defined in * a module which doesn't use it. * * It also happens in the has_mods case, where the * 'first' entry has no sites associated with it. */ continue; } stop = __stop_static_call_sites; if (mod) { #ifdef CONFIG_MODULES stop = mod->static_call_sites + mod->num_static_call_sites; init = mod->state == MODULE_STATE_COMING; #endif } for (site = site_mod->sites; site < stop && static_call_key(site) == key; site++) { void *site_addr = static_call_addr(site); if (!init && static_call_is_init(site)) continue; if (!kernel_text_address((unsigned long)site_addr)) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!static_call_is_init(site), "can't patch static call site at %pS", site_addr); continue; } arch_static_call_transform(site_addr, tramp, func, static_call_is_tail(site)); } } done: static_call_unlock(); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(__static_call_update); static int __static_call_init(struct module *mod, struct static_call_site *start, struct static_call_site *stop) { struct static_call_site *site; struct static_call_key *key, *prev_key = NULL; struct static_call_mod *site_mod; if (start == stop) return 0; static_call_sort_entries(start, stop); for (site = start; site < stop; site++) { void *site_addr = static_call_addr(site); if ((mod && within_module_init((unsigned long)site_addr, mod)) || (!mod && init_section_contains(site_addr, 1))) static_call_set_init(site); key = static_call_key(site); if (key != prev_key) { prev_key = key; /* * For vmlinux (!mod) avoid the allocation by storing * the sites pointer in the key itself. Also see * __static_call_update()'s @first. * * This allows architectures (eg. x86) to call * static_call_init() before memory allocation works. */ if (!mod) { key->sites = site; key->type |= 1; goto do_transform; } site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); if (!site_mod) return -ENOMEM; /* * When the key has a direct sites pointer, extract * that into an explicit struct static_call_mod, so we * can have a list of modules. */ if (static_call_key_sites(key)) { site_mod->mod = NULL; site_mod->next = NULL; site_mod->sites = static_call_key_sites(key); key->mods = site_mod; site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); if (!site_mod) return -ENOMEM; } site_mod->mod = mod; site_mod->sites = site; site_mod->next = static_call_key_next(key); key->mods = site_mod; } do_transform: arch_static_call_transform(site_addr, NULL, key->func, static_call_is_tail(site)); } return 0; } static int addr_conflict(struct static_call_site *site, void *start, void *end) { unsigned long addr = (unsigned long)static_call_addr(site); if (addr <= (unsigned long)end && addr + CALL_INSN_SIZE > (unsigned long)start) return 1; return 0; } static int __static_call_text_reserved(struct static_call_site *iter_start, struct static_call_site *iter_stop, void *start, void *end, bool init) { struct static_call_site *iter = iter_start; while (iter < iter_stop) { if (init || !static_call_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifdef CONFIG_MODULES static int __static_call_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; scoped_guard(rcu) { mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; } if (!mod) return 0; ret = __static_call_text_reserved(mod->static_call_sites, mod->static_call_sites + mod->num_static_call_sites, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static unsigned long tramp_key_lookup(unsigned long addr) { struct static_call_tramp_key *start = __start_static_call_tramp_key; struct static_call_tramp_key *stop = __stop_static_call_tramp_key; struct static_call_tramp_key *tramp_key; for (tramp_key = start; tramp_key != stop; tramp_key++) { unsigned long tramp; tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; if (tramp == addr) return (long)tramp_key->key + (long)&tramp_key->key; } return 0; } static int static_call_add_module(struct module *mod) { struct static_call_site *start = mod->static_call_sites; struct static_call_site *stop = start + mod->num_static_call_sites; struct static_call_site *site; for (site = start; site != stop; site++) { unsigned long s_key = __static_call_key(site); unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; unsigned long key; /* * Is the key is exported, 'addr' points to the key, which * means modules are allowed to call static_call_update() on * it. * * Otherwise, the key isn't exported, and 'addr' points to the * trampoline so we need to lookup the key. * * We go through this dance to prevent crazy modules from * abusing sensitive static calls. */ if (!kernel_text_address(addr)) continue; key = tramp_key_lookup(addr); if (!key) { pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", static_call_addr(site)); return -EINVAL; } key |= s_key & STATIC_CALL_SITE_FLAGS; site->key = key - (long)&site->key; } return __static_call_init(mod, start, stop); } static void static_call_del_module(struct module *mod) { struct static_call_site *start = mod->static_call_sites; struct static_call_site *stop = mod->static_call_sites + mod->num_static_call_sites; struct static_call_key *key, *prev_key = NULL; struct static_call_mod *site_mod, **prev; struct static_call_site *site; for (site = start; site < stop; site++) { key = static_call_key(site); /* * If the key was not updated due to a memory allocation * failure in __static_call_init() then treating key::sites * as key::mods in the code below would cause random memory * access and #GP. In that case all subsequent sites have * not been touched either, so stop iterating. */ if (!static_call_key_has_mods(key)) break; if (key == prev_key) continue; prev_key = key; for (prev = &key->mods, site_mod = key->mods; site_mod && site_mod->mod != mod; prev = &site_mod->next, site_mod = site_mod->next) ; if (!site_mod) continue; *prev = site_mod->next; kfree(site_mod); } } static int static_call_module_notify(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); static_call_lock(); switch (val) { case MODULE_STATE_COMING: ret = static_call_add_module(mod); if (ret) { pr_warn("Failed to allocate memory for static calls\n"); static_call_del_module(mod); } break; case MODULE_STATE_GOING: static_call_del_module(mod); break; } static_call_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block static_call_module_nb = { .notifier_call = static_call_module_notify, }; #else static inline int __static_call_mod_text_reserved(void *start, void *end) { return 0; } #endif /* CONFIG_MODULES */ int static_call_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __static_call_text_reserved(__start_static_call_sites, __stop_static_call_sites, start, end, init); if (ret) return ret; return __static_call_mod_text_reserved(start, end); } int __init static_call_init(void) { int ret; /* See static_call_force_reinit(). */ if (static_call_initialized == 1) return 0; cpus_read_lock(); static_call_lock(); ret = __static_call_init(NULL, __start_static_call_sites, __stop_static_call_sites); static_call_unlock(); cpus_read_unlock(); if (ret) { pr_err("Failed to allocate memory for static_call!\n"); BUG(); } #ifdef CONFIG_MODULES if (!static_call_initialized) register_module_notifier(&static_call_module_nb); #endif static_call_initialized = 1; return 0; } early_initcall(static_call_init); #ifdef CONFIG_STATIC_CALL_SELFTEST static int func_a(int x) { return x+1; } static int func_b(int x) { return x+2; } DEFINE_STATIC_CALL(sc_selftest, func_a); static struct static_call_data { int (*func)(int); int val; int expect; } static_call_data [] __initdata = { { NULL, 2, 3 }, { func_b, 2, 4 }, { func_a, 2, 3 } }; static int __init test_static_call_init(void) { int i; for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { struct static_call_data *scd = &static_call_data[i]; if (scd->func) static_call_update(sc_selftest, scd->func); WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); } return 0; } early_initcall(test_static_call_init); #endif /* CONFIG_STATIC_CALL_SELFTEST */ |
| 36 13 3 47 1 3 19 9 8 2 8 10 22 22 10 19 4 15 6 17 6 14 2 17 6 2 4 4 4 4 4 4 2 109 1 109 5 104 102 44 58 8 49 49 50 50 37 13 8 8 571 571 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 fragment reassembly * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: net/ipv4/ip_fragment.c */ /* * Fixes: * Andi Kleen Make it work with multiple hosts. * More RFC compliance. * * Horst von Brand Add missing #include <linux/string.h> * Alexey Kuznetsov SMP races, threading, cleanup. * Patrick McHardy LRU queue of frag heads for evictor. * Mitsuru KANDA @USAGI Register inet6_protocol{}. * David Stevens and * YOSHIFUJI,H. @USAGI Always remove fragment header to * calculate ICV correctly. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/tcp.h> #include <linux/udp.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ipv6_frag.h> #include <net/inet_ecn.h> static const char ip6_frag_cache_name[] = "ip6-frags"; static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs); static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, .daddr = hdr->daddr, .user = IP6_DEFRAG_LOCAL_DELIVER, .iif = iif, }; struct inet_frag_queue *q; if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL))) key.iif = 0; q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; return container_of(q, struct frag_queue, q); } static int ip6_frag_queue(struct net *net, struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset, int *refs) { int offset, end, fragsize; struct sk_buff *prev_tail; struct net_device *dev; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (fq->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); /* note that if prob_offset is set, the skb is freed elsewhere, * we do not free it here. */ return -1; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto discard_fq; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. */ if (end & 0x7) { /* RFC2460 says always send parameter problem in * this case. -DaveM */ *prob_offset = offsetof(struct ipv6hdr, payload_len); return -1; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto discard_fq; fq->q.len = end; } } if (end == offset) goto discard_fq; err = -ENOMEM; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) goto discard_fq; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_fq; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto insert_error; if (dev) fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) fq->q.max_size = fragsize; /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; fq->q.flags |= INET_FRAG_FIRST_IN; } if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; return err; } skb_dst_drop(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: inet_frag_kill(&fq->q, refs); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } /* * Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; u8 ecn; inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) goto out_fail; reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; payload_len = -skb_network_offset(skb) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr); if (payload_len > IPV6_MAXPLEN) goto out_oversize; /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ nhoff = fq->nhoffset; skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; memmove(skb->head + sizeof(struct frag_hdr), skb->head, (skb->data - skb->head) - sizeof(struct frag_hdr)); if (skb_mac_header_was_set(skb)) skb->mac_header += sizeof(struct frag_hdr); skb->network_header += sizeof(struct frag_hdr); skb_reset_transport_header(skb); inet_frag_reasm_finish(&fq->q, skb, reasm_data, true); skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->nhoff = nhoff; IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; IP6CB(skb)->frag_max_size = fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oversize: net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len); goto out_fail; out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); inet_frag_kill(&fq->q, refs); return -1; } static int ipv6_frag_rcv(struct sk_buff *skb) { const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = skb_dst_dev_net(skb); struct frag_hdr *fhdr; struct frag_queue *fq; u8 nexthdr; int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ if (hdr->payload_len == 0) goto fail_hdr; if (!pskb_may_pull(skb, (skb_transport_offset(skb) + sizeof(struct frag_hdr)))) goto fail_hdr; hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); if (!(fhdr->frag_off & htons(IP6_OFFSET | IP6_MF))) { /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; IP6CB(skb)->frag_max_size = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr); return 1; } /* RFC 8200, Section 4.5 Fragment Header: * If the first fragment does not include all headers through an * Upper-Layer header, then that fragment should be discarded and * an ICMP Parameter Problem, Code 3, message should be sent to * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); return -1; } iif = skb->dev ? skb->dev->ifindex : 0; rcu_read_lock(); fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; int ret, refs = 0; spin_lock(&fq->q.lock); fq->iif = iif; ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff, &prob_offset, &refs); spin_unlock(&fq->q.lock); rcu_read_unlock(); inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); /* icmpv6_param_prob() calls kfree_skb(skb) */ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); } return ret; } rcu_read_unlock(); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; fail_hdr: __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } static const struct inet6_protocol frag_protocol = { .handler = ipv6_frag_rcv, .flags = INET6_PROTO_NOPOLICY, }; #ifdef CONFIG_SYSCTL static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ip6frag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ip6frag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* secret interval has been deprecated */ static int ip6_frags_secret_interval_unused; static struct ctl_table ip6_frags_ctl_table[] = { { .procname = "ip6frag_secret_interval", .data = &ip6_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip6_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv6.fqdir->high_thresh; table[0].extra1 = &net->ipv6.fqdir->low_thresh; table[1].data = &net->ipv6.fqdir->low_thresh; table[1].extra2 = &net->ipv6.fqdir->high_thresh; table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(ip6_frags_ns_ctl_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *ip6_ctl_header; static int ip6_frags_sysctl_register(void) { ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6", ip6_frags_ctl_table); return ip6_ctl_header == NULL ? -ENOMEM : 0; } static void ip6_frags_sysctl_unregister(void) { unregister_net_sysctl_table(ip6_ctl_header); } #else static int ip6_frags_ns_sysctl_register(struct net *net) { return 0; } static void ip6_frags_ns_sysctl_unregister(struct net *net) { } static int ip6_frags_sysctl_register(void) { return 0; } static void ip6_frags_sysctl_unregister(void) { } #endif static int __net_init ipv6_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; res = ip6_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(net->ipv6.fqdir); return res; } static void __net_exit ipv6_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv6.fqdir); } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { .init = ipv6_frags_init_net, .pre_exit = ipv6_frags_pre_exit_net, .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = ip6frag_key_hashfn, .obj_hashfn = ip6frag_obj_hashfn, .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; int __init ipv6_frag_init(void) { int ret; ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frags_cache_name = ip6_frag_cache_name; ip6_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&ip6_frags); if (ret) goto out; ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); if (ret) goto err_protocol; ret = ip6_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&ip6_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: ip6_frags_sysctl_unregister(); err_sysctl: inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); err_protocol: inet_frags_fini(&ip6_frags); goto out; } void ipv6_frag_exit(void) { ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); inet_frags_fini(&ip6_frags); } |
| 66 11 716 736 44 1 19984 5 6 4933 8 8 3 3 5 5 5164 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_MM_H #define _LINUX_SCHED_MM_H #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/gfp.h> #include <linux/sync_core.h> #include <linux/sched/coredump.h> /* * Routines for handling mm_structs */ extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. * @mm: The &struct mm_struct to pin. * * Make sure that @mm will not get freed even after the owning task * exits. This doesn't guarantee that the associated address space * will still exist later on and mmget_not_zero() has to be used before * accessing it. * * This is a preferred way to pin @mm for a longer/unbounded amount * of time. * * Use mmdrop() to release the reference acquired by mmgrab(). * * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) { atomic_inc(&mm->mm_count); } static inline void smp_mb__after_mmgrab(void) { smp_mb__after_atomic(); } extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) { /* * The implicit full barrier implied by atomic_dec_and_test() is * required by the membarrier system call before returning to * user-space, after storing to rq->curr. */ if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } #ifdef CONFIG_PREEMPT_RT /* * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is * by far the least expensive way to do that. */ static inline void __mmdrop_delayed(struct rcu_head *rhp) { struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); __mmdrop(mm); } /* * Invoked from finish_task_switch(). Delegates the heavy lifting on RT * kernels via RCU. */ static inline void mmdrop_sched(struct mm_struct *mm) { /* Provides a full memory barrier. See mmdrop() */ if (atomic_dec_and_test(&mm->mm_count)) call_rcu(&mm->delayed_drop, __mmdrop_delayed); } #else static inline void mmdrop_sched(struct mm_struct *mm) { mmdrop(mm); } #endif /* Helpers for lazy TLB mm refcounting */ static inline void mmgrab_lazy_tlb(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) mmgrab(mm); } static inline void mmdrop_lazy_tlb(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { mmdrop(mm); } else { /* * mmdrop_lazy_tlb must provide a full memory barrier, see the * membarrier comment finish_task_switch which relies on this. */ smp_mb(); } } static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) mmdrop_sched(mm); else smp_mb(); /* see mmdrop_lazy_tlb() above */ } /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. * * Make sure that the address space of the given &struct mm_struct doesn't * go away. This does not protect against parts of the address space being * modified or freed, however. * * Never use this function to pin this address space for an * unbounded/indefinite amount of time. * * Use mmput() to release the reference acquired by mmget(). * * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) { atomic_inc(&mm->mm_users); } static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); } /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); #if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) /* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ void mmput_async(struct mm_struct *); #endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* * Grab a reference to a task's mm, if it is not already going away * and ptrace_may_access with the mode parameter passed to it * succeeds. */ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct on exit() */ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *); #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU #ifndef arch_get_mmap_end #define arch_get_mmap_end(addr, len, flags) (TASK_SIZE) #endif #ifndef arch_get_mmap_base #define arch_get_mmap_base(addr, base) (base) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack); unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) { bool ret; /* * need RCU to access ->real_parent if CLONE_VM was used along with * CLONE_PARENT. * * We check real_parent->mm == tsk->mm because CLONE_VFORK does not * imply CLONE_VM * * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus * ->real_parent is not necessarily the task doing vfork(), so in * theory we can't rely on task_lock() if we want to dereference it. * * And in this case we can't trust the real_parent->mm == tsk->mm * check, it can be false negative. But we do not care, if init or * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); ret = tsk->vfork_done && rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; } /* * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence */ if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } return flags; } #ifdef CONFIG_LOCKDEP extern void __fs_reclaim_acquire(unsigned long ip); extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else static inline void __fs_reclaim_acquire(unsigned long ip) { } static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif /* Any memory-allocation retry loop should use * memalloc_retry_wait(), and pass the flags for the most * constrained allocation attempt that might have failed. * This provides useful documentation of where loops are, * and a central place to fine tune the waiting as the MM * implementation changes. */ static inline void memalloc_retry_wait(gfp_t gfp_flags) { /* We use io_schedule_timeout because waiting for memory * typically included waiting for dirty pages to be * written out, which requires IO. */ __set_current_state(TASK_UNINTERRUPTIBLE); gfp_flags = current_gfp_context(gfp_flags); if (gfpflags_allow_blocking(gfp_flags) && !(gfp_flags & __GFP_NORETRY)) /* Probably waited already, no need for much more */ io_schedule_timeout(1); else /* Probably didn't wait, and has now released a lock, * so now is a good time to wait */ io_schedule_timeout(HZ/50); } /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate * * Similar to might_sleep() and other annotations, this can be used in functions * that might allocate, but often don't. Compiles to nothing without * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. */ static inline void might_alloc(gfp_t gfp_mask) { fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } /** * memalloc_flags_save - Add a PF_* flag to current->flags, save old value * * This allows PF_* flags to be conveniently added, irrespective of current * value, and then the old version restored with memalloc_flags_restore(). */ static inline unsigned memalloc_flags_save(unsigned flags) { unsigned oldflags = ~current->flags & flags; current->flags |= flags; return oldflags; } static inline void memalloc_flags_restore(unsigned flags) { current->flags &= ~flags; } /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * * This functions marks the beginning of the GFP_NOIO allocation scope. * All further allocations will implicitly drop __GFP_IO flag and so * they are safe for the IO critical section from the allocation recursion * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * * Context: This function is safe to be used from any context. * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_noio_save call. */ static inline void memalloc_noio_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. * * This functions marks the beginning of the GFP_NOFS allocation scope. * All further allocations will implicitly drop __GFP_FS flag and so * they are safe for the FS critical section from the allocation recursion * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * * Context: This function is safe to be used from any context. * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_nofs_save call. */ static inline void memalloc_nofs_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. * * This function marks the beginning of the __GFP_MEMALLOC allocation scope. * All further allocations will implicitly add the __GFP_MEMALLOC flag, which * prevents entering reclaim and allows access to all memory reserves. This * should only be used when the caller guarantees the allocation will allow more * memory to be freed very shortly, i.e. it needs to allocate some memory in * the process of freeing memory, and cannot reclaim due to potential recursion. * * Users of this scope have to be extremely careful to not deplete the reserves * completely and implement a throttling mechanism which controls the * consumption of the reserve based on the amount of freed memory. Usage of a * pre-allocated pool (e.g. mempool) should be always considered before using * this scope. * * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC * * Context: This function should not be used in an interrupt context as that one * does not give PF_MEMALLOC access to reserves. * See __gfp_pfmemalloc_flags(). * Return: The saved flags to be passed to memalloc_noreclaim_restore. */ static inline unsigned int memalloc_noreclaim_save(void) { return memalloc_flags_save(PF_MEMALLOC); } /** * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. * @flags: Flags to restore. * * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save * function. Always make sure that the given flags is the return value from the * pairing memalloc_noreclaim_save call. */ static inline void memalloc_noreclaim_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. * * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. * All further allocations will implicitly remove the __GFP_MOVABLE flag, which * will constraint the allocations to zones that allow long term pinning, i.e. * not ZONE_MOVABLE zones. * * Return: The saved flags to be passed to memalloc_pin_restore. */ static inline unsigned int memalloc_pin_save(void) { return memalloc_flags_save(PF_MEMALLOC_PIN); } /** * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. * @flags: Flags to restore. * * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. * Always make sure that the given flags is the return value from the pairing * memalloc_pin_save call. */ static inline void memalloc_pin_restore(unsigned int flags) { memalloc_flags_restore(flags); } #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); /** * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * * Please, make sure that caller has a reference to the passed memcg structure, * so its lifetime is guaranteed to exceed the scope between two * set_active_memcg() calls. * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { old = current->active_memcg; current->active_memcg = memcg; } return old; } #else static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { return NULL; } #endif #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7), }; enum { MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), MEMBARRIER_FLAG_RSEQ = (1U << 1), }; #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS #include <asm/membarrier.h> #endif static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { /* * The atomic_read() below prevents CSE. The following should * help the compiler generate more efficient code on architectures * where sync_core_before_usermode() is a no-op. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; sync_core_before_usermode(); } extern void membarrier_exec_mmap(struct mm_struct *mm); extern void membarrier_update_current_mm(struct mm_struct *next_mm); #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { } #endif static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } static inline void membarrier_update_current_mm(struct mm_struct *next_mm) { } #endif #endif /* _LINUX_SCHED_MM_H */ |
| 26 1 24 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2011 Florian Westphal <fw@strlen.de> * * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/ip.h> #include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/route.h> #include <linux/netfilter/xt_rpfilter.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match"); /* don't try to find route from mcast/bcast/zeronet */ static __be32 rpfilter_get_saddr(__be32 addr) { if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || ipv4_is_zeronet(addr)) return 0; return addr; } static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL)) return false; } return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE; } static bool rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rpfilter_info *info; const struct iphdr *iph; struct flowi4 flow; bool invert; info = par->matchinfo; invert = info->flags & XT_RPFILTER_INVERT; if (rpfilter_is_loopback(skb, xt_in(par))) return true ^ invert; iph = ip_hdr(skb); if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) return true ^ invert; } memset(&flow, 0, sizeof(flow)); flow.flowi4_iif = LOOPBACK_IFINDEX; flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; flow.flowi4_dscp = ip4h_dscp(iph); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) { const struct xt_rpfilter_info *info = par->matchinfo; unsigned int options = ~XT_RPFILTER_OPTION_MASK; if (info->flags & options) { pr_info_ratelimited("unknown options\n"); return -EINVAL; } if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "raw") != 0) { pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n", par->table); return -EINVAL; } return 0; } static struct xt_match rpfilter_mt_reg __read_mostly = { .name = "rpfilter", .family = NFPROTO_IPV4, .checkentry = rpfilter_check, .match = rpfilter_mt, .matchsize = sizeof(struct xt_rpfilter_info), .hooks = (1 << NF_INET_PRE_ROUTING), .me = THIS_MODULE }; static int __init rpfilter_mt_init(void) { return xt_register_match(&rpfilter_mt_reg); } static void __exit rpfilter_mt_exit(void) { xt_unregister_match(&rpfilter_mt_reg); } module_init(rpfilter_mt_init); module_exit(rpfilter_mt_exit); |
| 68 35 55 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 | #ifndef LLC_PDU_H #define LLC_PDU_H /* * Copyright (c) 1997 by Procom Technology,Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if_ether.h> /* Lengths of frame formats */ #define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ #define LLC_PDU_LEN_S 4 #define LLC_PDU_LEN_U 3 /* header and 1 control byte */ /* header and 1 control byte and XID info */ #define LLC_PDU_LEN_U_XID (LLC_PDU_LEN_U + sizeof(struct llc_xid_info)) /* Known SAP addresses */ #define LLC_GLOBAL_SAP 0xFF #define LLC_NULL_SAP 0x00 /* not network-layer visible */ #define LLC_MGMT_INDIV 0x02 /* station LLC mgmt indiv addr */ #define LLC_MGMT_GRP 0x03 /* station LLC mgmt group addr */ #define LLC_RDE_SAP 0xA6 /* route ... */ /* SAP field bit masks */ #define LLC_ISO_RESERVED_SAP 0x02 #define LLC_SAP_GROUP_DSAP 0x01 #define LLC_SAP_RESP_SSAP 0x01 /* Group/individual DSAP indicator is DSAP field */ #define LLC_PDU_GROUP_DSAP_MASK 0x01 #define LLC_PDU_IS_GROUP_DSAP(pdu) \ ((pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) #define LLC_PDU_IS_INDIV_DSAP(pdu) \ (!(pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) /* Command/response PDU indicator in SSAP field */ #define LLC_PDU_CMD_RSP_MASK 0x01 #define LLC_PDU_CMD 0 #define LLC_PDU_RSP 1 #define LLC_PDU_IS_CMD(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 0 : 1) #define LLC_PDU_IS_RSP(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 1 : 0) /* Get PDU type from 2 lowest-order bits of control field first byte */ #define LLC_PDU_TYPE_I_MASK 0x01 /* 16-bit control field */ #define LLC_PDU_TYPE_S_MASK 0x03 #define LLC_PDU_TYPE_U_MASK 0x03 /* 8-bit control field */ #define LLC_PDU_TYPE_MASK 0x03 #define LLC_PDU_TYPE_I 0 /* first bit */ #define LLC_PDU_TYPE_S 1 /* first two bits */ #define LLC_PDU_TYPE_U 3 /* first two bits */ #define LLC_PDU_TYPE_U_XID 4 /* private type for detecting XID commands */ #define LLC_PDU_TYPE_IS_I(pdu) \ ((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0) #define LLC_PDU_TYPE_IS_U(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_U_MASK) == LLC_PDU_TYPE_U) ? 1 : 0) #define LLC_PDU_TYPE_IS_S(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_S_MASK) == LLC_PDU_TYPE_S) ? 1 : 0) /* U-format PDU control field masks */ #define LLC_U_PF_BIT_MASK 0x10 /* P/F bit mask */ #define LLC_U_PF_IS_1(pdu) ((pdu->ctrl_1 & LLC_U_PF_BIT_MASK) ? 1 : 0) #define LLC_U_PF_IS_0(pdu) ((!(pdu->ctrl_1 & LLC_U_PF_BIT_MASK)) ? 1 : 0) #define LLC_U_PDU_CMD_MASK 0xEC /* cmd/rsp mask */ #define LLC_U_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_U_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_1_PDU_CMD_UI 0x00 /* Type 1 cmds/rsps */ #define LLC_1_PDU_CMD_XID 0xAC #define LLC_1_PDU_CMD_TEST 0xE0 #define LLC_2_PDU_CMD_SABME 0x6C /* Type 2 cmds/rsps */ #define LLC_2_PDU_CMD_DISC 0x40 #define LLC_2_PDU_RSP_UA 0x60 #define LLC_2_PDU_RSP_DM 0x0C #define LLC_2_PDU_RSP_FRMR 0x84 /* Type 1 operations */ /* XID information field bit masks */ /* LLC format identifier (byte 1) */ #define LLC_XID_FMT_ID 0x81 /* first byte must be this */ /* LLC types/classes identifier (byte 2) */ #define LLC_XID_CLASS_ZEROS_MASK 0xE0 /* these must be zeros */ #define LLC_XID_CLASS_MASK 0x1F /* AND with byte to get below */ #define LLC_XID_NULL_CLASS_1 0x01 /* if NULL LSAP...use these */ #define LLC_XID_NULL_CLASS_2 0x03 #define LLC_XID_NULL_CLASS_3 0x05 #define LLC_XID_NULL_CLASS_4 0x07 #define LLC_XID_NNULL_TYPE_1 0x01 /* if non-NULL LSAP...use these */ #define LLC_XID_NNULL_TYPE_2 0x02 #define LLC_XID_NNULL_TYPE_3 0x04 #define LLC_XID_NNULL_TYPE_1_2 0x03 #define LLC_XID_NNULL_TYPE_1_3 0x05 #define LLC_XID_NNULL_TYPE_2_3 0x06 #define LLC_XID_NNULL_ALL 0x07 /* Sender Receive Window (byte 3) */ #define LLC_XID_RW_MASK 0xFE /* AND with value to get below */ #define LLC_XID_MIN_RW 0x02 /* lowest-order bit always zero */ /* Type 2 operations */ #define LLC_2_SEQ_NBR_MODULO ((u8) 128) /* I-PDU masks ('ctrl' is I-PDU control word) */ #define LLC_I_GET_NS(pdu) (u8)((pdu->ctrl_1 & 0xFE) >> 1) #define LLC_I_GET_NR(pdu) (u8)((pdu->ctrl_2 & 0xFE) >> 1) #define LLC_I_PF_BIT_MASK 0x01 #define LLC_I_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_I_PF_BIT_MASK)) ? 1 : 0) #define LLC_I_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_I_PF_BIT_MASK) ? 1 : 0) /* S-PDU supervisory commands and responses */ #define LLC_S_PDU_CMD_MASK 0x0C #define LLC_S_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_S_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_2_PDU_CMD_RR 0x00 /* rx ready cmd */ #define LLC_2_PDU_RSP_RR 0x00 /* rx ready rsp */ #define LLC_2_PDU_CMD_REJ 0x08 /* reject PDU cmd */ #define LLC_2_PDU_RSP_REJ 0x08 /* reject PDU rsp */ #define LLC_2_PDU_CMD_RNR 0x04 /* rx not ready cmd */ #define LLC_2_PDU_RSP_RNR 0x04 /* rx not ready rsp */ #define LLC_S_PF_BIT_MASK 0x01 #define LLC_S_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_S_PF_BIT_MASK)) ? 1 : 0) #define LLC_S_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_S_PF_BIT_MASK) ? 1 : 0) #define PDU_SUPV_GET_Nr(pdu) ((pdu->ctrl_2 & 0xFE) >> 1) #define PDU_GET_NEXT_Vr(sn) (((sn) + 1) & ~LLC_2_SEQ_NBR_MODULO) /* FRMR information field macros */ #define FRMR_INFO_LENGTH 5 /* 5 bytes of information */ /* * info is pointer to FRMR info field structure; 'rej_ctrl' is byte pointer * (if U-PDU) or word pointer to rejected PDU control field */ #define FRMR_INFO_SET_REJ_CNTRL(info,rej_ctrl) \ info->rej_pdu_ctrl = ((*((u8 *) rej_ctrl) & \ LLC_PDU_TYPE_U) != LLC_PDU_TYPE_U ? \ (u16)*((u16 *) rej_ctrl) : \ (((u16) *((u8 *) rej_ctrl)) & 0x00FF)) /* * Info is pointer to FRMR info field structure; 'vs' is a byte containing * send state variable value in low-order 7 bits (insure the lowest-order * bit remains zero (0)) */ #define FRMR_INFO_SET_Vs(info,vs) (info->curr_ssv = (((u8) vs) << 1)) #define FRMR_INFO_SET_Vr(info,vr) (info->curr_rsv = (((u8) vr) << 1)) /* * Info is pointer to FRMR info field structure; 'cr' is a byte containing * the C/R bit value in the low-order bit */ #define FRMR_INFO_SET_C_R_BIT(info, cr) (info->curr_rsv |= (((u8) cr) & 0x01)) /* * In the remaining five macros, 'info' is pointer to FRMR info field * structure; 'ind' is a byte containing the bit value to set in the * lowest-order bit) */ #define FRMR_INFO_SET_INVALID_PDU_CTRL_IND(info, ind) \ (info->ind_bits = ((info->ind_bits & 0xFE) | (((u8) ind) & 0x01))) #define FRMR_INFO_SET_INVALID_PDU_INFO_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFD) | (((u8) ind) & 0x02))) #define FRMR_INFO_SET_PDU_INFO_2LONG_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFB) | (((u8) ind) & 0x04))) #define FRMR_INFO_SET_PDU_INVALID_Nr_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xF7) | (((u8) ind) & 0x08))) #define FRMR_INFO_SET_PDU_INVALID_Ns_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xEF) | (((u8) ind) & 0x10))) /* Sequence-numbered PDU format (4 bytes in length) */ struct llc_pdu_sn { u8 dsap; u8 ssap; u8 ctrl_1; u8 ctrl_2; } __packed; static inline struct llc_pdu_sn *llc_pdu_sn_hdr(struct sk_buff *skb) { return (struct llc_pdu_sn *)skb_network_header(skb); } /* Un-numbered PDU format (3 bytes in length) */ struct llc_pdu_un { u8 dsap; u8 ssap; u8 ctrl_1; } __packed; static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) { return (struct llc_pdu_un *)skb_network_header(skb); } /** * llc_pdu_header_init - initializes pdu header * @skb: input skb that header must be set into it. * @type: type of PDU (U, I or S). * @ssap: source sap. * @dsap: destination sap. * @cr: command/response bit (0 or 1). * * This function sets DSAP, SSAP and command/Response bit in LLC header. */ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, u8 ssap, u8 dsap, u8 cr) { int hlen = 4; /* default value for I and S types */ struct llc_pdu_un *pdu; switch (type) { case LLC_PDU_TYPE_U: hlen = 3; break; case LLC_PDU_TYPE_U_XID: hlen = 6; break; } skb_push(skb, hlen); skb_reset_network_header(skb); pdu = llc_pdu_un_hdr(skb); pdu->dsap = dsap; pdu->ssap = ssap; pdu->ssap |= cr; } /** * llc_pdu_decode_sa - extracts, source address (MAC) of input frame * @skb: input skb that source address must be extracted from it. * @sa: pointer to source address (6 byte array). * * This function extracts source address(MAC) of input frame. */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it * @da: pointer to destination address (6 byte array). * * This function extracts destination address(MAC) of input frame. */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** * llc_pdu_decode_ssap - extracts source SAP of input frame * @skb: input skb that source SAP must be extracted from it. * @ssap: source SAP (output argument). * * This function extracts source SAP of input frame. Right bit of SSAP is * command/response bit. */ static inline void llc_pdu_decode_ssap(struct sk_buff *skb, u8 *ssap) { *ssap = llc_pdu_un_hdr(skb)->ssap & 0xFE; } /** * llc_pdu_decode_dsap - extracts dest SAP of input frame * @skb: input skb that destination SAP must be extracted from it. * @dsap: destination SAP (output argument). * * This function extracts destination SAP of input frame. right bit of * DSAP designates individual/group SAP. */ static inline void llc_pdu_decode_dsap(struct sk_buff *skb, u8 *dsap) { *dsap = llc_pdu_un_hdr(skb)->dsap & 0xFE; } /** * llc_pdu_init_as_ui_cmd - sets LLC header as UI PDU * @skb: input skb that header must be set into it. * * This function sets third byte of LLC header as a UI PDU. */ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_UI; } /** * llc_pdu_init_as_test_cmd - sets PDU as TEST * @skb: Address of the skb to build * * Sets a PDU as TEST */ static inline void llc_pdu_init_as_test_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_test_rsp - build TEST response PDU * @skb: Address of the skb to build * @ev_skb: The received TEST command PDU frame * * Builds a pdu frame as a TEST response. */ static inline void llc_pdu_init_as_test_rsp(struct sk_buff *skb, struct sk_buff *ev_skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; if (ev_skb->protocol == htons(ETH_P_802_2)) { struct llc_pdu_un *ev_pdu = llc_pdu_un_hdr(ev_skb); int dsize; dsize = ntohs(eth_hdr(ev_skb)->h_proto) - 3; memcpy(((u8 *)pdu) + 3, ((u8 *)ev_pdu) + 3, dsize); skb_put(skb, dsize); } } /* LLC Type 1 XID command/response information fields format */ struct llc_xid_info { u8 fmt_id; /* always 0x81 for LLC */ u8 type; /* different if NULL/non-NULL LSAP */ u8 rw; /* sender receive window */ } __packed; /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * This function sets third,fourth,fifth and sixth bytes of LLC header as * a XID PDU. */ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; /* 0x81 */ xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; /* size of receive window */ /* no need to push/put since llc_pdu_header_init() has already * pushed 3 + 3 bytes */ } /** * llc_pdu_init_as_xid_rsp - builds XID response PDU * @skb: Address of the skb to build * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * Builds a pdu frame as an XID response. */ static inline void llc_pdu_init_as_xid_rsp(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; skb_put(skb, sizeof(struct llc_xid_info)); } /* LLC Type 2 FRMR response information field format */ struct llc_frmr_info { u16 rej_pdu_ctrl; /* bits 1-8 if U-PDU */ u8 curr_ssv; /* current send state variable val */ u8 curr_rsv; /* current receive state variable */ u8 ind_bits; /* indicator bits set with macro */ } __packed; void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 type); void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value); void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit); void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr); void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit); void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, u8 f_bit, u8 vs, u8 vr, u8 vzyxw); void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit); #endif /* LLC_PDU_H */ |
| 88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ struct inotify_event_info { struct fsnotify_event fse; u32 mask; int wd; u32 sync_cookie; int name_len; char name[]; }; struct inotify_inode_mark { struct fsnotify_mark fsn_mark; int wd; }; static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct inotify_event_info, fse); } /* * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is * used only internally to the kernel. */ #define INOTIFY_USER_MASK (IN_ALL_EVENTS) static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark) { __u32 mask = fsn_mark->mask & INOTIFY_USER_MASK; if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) mask |= IN_EXCL_UNLINK; if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) mask |= IN_ONESHOT; return mask; } extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); } static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) { return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); } static inline void dec_inotify_watches(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); } #endif |
| 571 571 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 | // SPDX-License-Identifier: GPL-2.0-or-later /* 6LoWPAN fragment reassembly * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "6LoWPAN: " fmt #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ieee802154_netdev.h> #include <net/6lowpan.h> #include <net/ipv6_frag.h> #include <net/inet_frag.h> #include <net/ip.h> #include "6lowpan_i.h" static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, struct net_device *ldev, int *refs); static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { const struct frag_lowpan_compare_key *key = a; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; int refs = 1; fq = container_of(frag, struct frag_queue, q); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q, &refs); out: spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); } static inline struct lowpan_frag_queue * fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); struct frag_lowpan_compare_key key = {}; struct inet_frag_queue *q; key.tag = cb->d_tag; key.d_size = cb->d_size; key.src = *src; key.dst = *dst; q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; return container_of(q, struct lowpan_frag_queue, q); } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, struct sk_buff *skb, u8 frag_type, int *refs) { struct sk_buff *prev_tail; struct net_device *ldev; int end, offset, err; /* inet_frag_queue_* functions use skb->cb; see struct ipfrag_skb_cb * in inet_fragment.c */ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet_skb_parm)); BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet6_skb_parm)); if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = lowpan_802154_cb(skb)->d_offset << 3; end = lowpan_802154_cb(skb)->d_size; /* Is this the final fragment? */ if (offset + skb->len == end) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } } ldev = skb->dev; if (ldev) skb->dev = NULL; barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto err; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs); skb->_skb_refdst = orefdst; return res; } skb_dst_drop(skb); return -1; err: kfree_skb(skb); return -1; } /* Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *ldev, int *refs) { void *reasm_data; inet_frag_kill(&fq->q, refs); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->dev = ldev; skb->tstamp = fq->q.stamp; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oom: net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n"); return -1; } static int lowpan_frag_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res) { switch (res) { case RX_QUEUED: return NET_RX_SUCCESS; case RX_CONTINUE: /* nobody cared about this packet */ net_warn_ratelimited("%s: received unknown dispatch\n", __func__); fallthrough; default: /* all others failure */ return NET_RX_DROP; } } static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb) { int ret; if (!lowpan_is_iphc(*skb_network_header(skb))) return RX_CONTINUE; ret = lowpan_iphc_decompress(skb); if (ret < 0) return RX_DROP; return RX_QUEUED; } static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb) { lowpan_rx_result res; #define CALL_RXH(rxh) \ do { \ res = rxh(skb); \ if (res != RX_CONTINUE) \ goto rxh_next; \ } while (0) /* likely at first */ CALL_RXH(lowpan_frag_rx_h_iphc); CALL_RXH(lowpan_rx_h_ipv6); rxh_next: return lowpan_frag_rx_handlers_result(skb, res); #undef CALL_RXH } #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07 #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8 static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type, struct lowpan_802154_cb *cb) { bool fail; u8 high = 0, low = 0; __be16 d_tag = 0; fail = lowpan_fetch_skb(skb, &high, 1); fail |= lowpan_fetch_skb(skb, &low, 1); /* remove the dispatch value and use first three bits as high value * for the datagram size */ cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) << LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low; fail |= lowpan_fetch_skb(skb, &d_tag, 2); cb->d_tag = ntohs(d_tag); if (frag_type == LOWPAN_DISPATCH_FRAGN) { fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1); } else { skb_reset_network_header(skb); cb->d_offset = 0; /* check if datagram_size has ipv6hdr on FRAG1 */ fail |= cb->d_size < sizeof(struct ipv6hdr); /* check if we can dereference the dispatch value */ fail |= !skb->len; } if (unlikely(fail)) return -EIO; return 0; } int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) { struct lowpan_frag_queue *fq; struct net *net = dev_net(skb->dev); struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); struct ieee802154_hdr hdr = {}; int err; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) goto err; err = lowpan_get_cb(skb, frag_type, cb); if (err < 0) goto err; if (frag_type == LOWPAN_DISPATCH_FRAG1) { err = lowpan_invoke_frag_rx_handlers(skb); if (err == NET_RX_DROP) goto err; } if (cb->d_size > IPV6_MIN_MTU) { net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; } rcu_read_lock(); fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { int ret, refs = 0; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type, &refs); spin_unlock(&fq->q.lock); rcu_read_unlock(); inet_frag_putn(&fq->q, refs); return ret; } rcu_read_unlock(); err: kfree_skb(skb); return -1; } #ifdef CONFIG_SYSCTL static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* secret interval has been deprecated */ static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); size_t table_size = ARRAY_SIZE(lowpan_frags_ns_ctl_table); table = lowpan_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) table_size = 0; } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; table[1].data = &ieee802154_lowpan->fqdir->low_thresh; table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; table[2].data = &ieee802154_lowpan->fqdir->timeout; hdr = register_net_sysctl_sz(net, "net/ieee802154/6lowpan", table, table_size); if (hdr == NULL) goto err_reg; ieee802154_lowpan->sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { const struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *lowpan_ctl_header; static int __init lowpan_frags_sysctl_register(void) { lowpan_ctl_header = register_net_sysctl(&init_net, "net/ieee802154/6lowpan", lowpan_frags_ctl_table); return lowpan_ctl_header == NULL ? -ENOMEM : 0; } static void lowpan_frags_sysctl_unregister(void) { unregister_net_sysctl_table(lowpan_ctl_header); } #else static inline int lowpan_frags_ns_sysctl_register(struct net *net) { return 0; } static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) { } static inline int __init lowpan_frags_sysctl_register(void) { return 0; } static inline void lowpan_frags_sysctl_unregister(void) { } #endif static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); int res; res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = lowpan_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(ieee802154_lowpan->fqdir); return res; } static void __net_exit lowpan_frags_pre_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); fqdir_pre_exit(ieee802154_lowpan->fqdir); } static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .pre_exit = lowpan_frags_pre_exit_net, .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_lowpan_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params lowpan_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = lowpan_key_hashfn, .obj_hashfn = lowpan_obj_hashfn, .obj_cmpfn = lowpan_obj_cmpfn, .automatic_shrinking = true, }; int __init lowpan_net_frag_init(void) { int ret; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; ret = lowpan_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&lowpan_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); err_sysctl: inet_frags_fini(&lowpan_frags); return ret; } void lowpan_net_frag_exit(void) { lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); inet_frags_fini(&lowpan_frags); } |
| 3 1 2 4 4 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> * * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6 * NAT funded by Astaro. */ #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/types.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #include <net/addrconf.h> #include <net/checksum.h> #include <net/protocol.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_redirect.h> static unsigned int redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) { return nf_nat_redirect_ipv6(skb, par->targinfo, xt_hooknum(par)); } static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; return nf_ct_netns_get(par->net, par->family); } static void redirect_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static int redirect_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { pr_debug("bad MAP_IPS.\n"); return -EINVAL; } if (mr->rangesize != 1) { pr_debug("bad rangesize %u.\n", mr->rangesize); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static unsigned int redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range = { .flags = mr->range[0].flags, .min_proto = mr->range[0].min, .max_proto = mr->range[0].max, }; return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par)); } static struct xt_target redirect_tg_reg[] __read_mostly = { { .name = "REDIRECT", .family = NFPROTO_IPV6, .revision = 0, .table = "nat", .checkentry = redirect_tg6_checkentry, .destroy = redirect_tg_destroy, .target = redirect_tg6, .targetsize = sizeof(struct nf_nat_range), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "REDIRECT", .family = NFPROTO_IPV4, .revision = 0, .table = "nat", .target = redirect_tg4, .checkentry = redirect_tg4_check, .destroy = redirect_tg_destroy, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init redirect_tg_init(void) { return xt_register_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); } static void __exit redirect_tg_exit(void) { xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg)); } module_init(redirect_tg_init); module_exit(redirect_tg_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); MODULE_ALIAS("ip6t_REDIRECT"); MODULE_ALIAS("ipt_REDIRECT"); |
| 12 853 65 65 907 161 65 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/gfp_types.h> #include <linux/mmzone.h> #include <linux/topology.h> #include <linux/alloc_tag.h> #include <linux/cleanup.h> #include <linux/sched.h> struct vm_area_struct; struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE); BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >> GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) { /* * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. * alloc_pages_nolock() is the only API that doesn't specify either flag. * * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level * GFP_$FOO flag for this use in alloc_pages_nolock() as the * regular page allocator doesn't fully support this * allocation mode. */ return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. */ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * gfp flag masking for nested internal allocations. * * For code that needs to do allocations inside the public allocation API (e.g. * memory allocation tracking code) the allocations need to obey the caller * allocation context constrains to prevent allocation context mismatches (e.g. * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock * situations. * * It is also assumed that these nested allocations are for internal kernel * object storage purposes only and are not going to be used for DMA, etc. Hence * we strip out all the zone information and leave just the context information * intact. * * Further, internal allocations must fail before the higher level allocation * can fail, so we must make them fail faster and fail silently. We also don't * want them to deplete emergency reserves. Hence nested allocations must be * prepared for these allocations to fail. */ static inline gfp_t gfp_nested_mask(gfp_t flags) { return ((flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP)) | (__GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)); } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the case of non-NUMA systems the NODE_DATA() gets optimized to * &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); #define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); #define alloc_pages_bulk_mempolicy(...) \ alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ #define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_node(...) \ alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) return; if (node_online(this_node)) return; pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); dump_stack(); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); return __alloc_pages_noprof(gfp_mask, order, nid, NULL); } #define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) static inline struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); return __folio_alloc_noprof(gfp, order, nid, NULL); } #define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__)) /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node_noprof(nid, gfp_mask, order); } #define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) { return folio_alloc_noprof(gfp, order); } #define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); #define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); #define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) void free_pages_exact(void *virt, size_t size); __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define alloc_pages_exact_nid(...) \ alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); static inline bool gfp_has_io_fs(gfp_t gfp) { return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } /* * Check if the gfp flags allow compaction - GFP_NOIO is a really * tricky context because the migration might require IO. */ static inline bool gfp_compaction_allowed(gfp_t gfp_mask) { return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); } extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC typedef unsigned int __bitwise acr_flags_t; #define ACR_FLAGS_NONE ((__force acr_flags_t)0) // ordinary allocation request #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask); #define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CONTIG_ALLOC static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { struct page *page; if (WARN_ON(!order || !(gfp & __GFP_COMP))) return NULL; page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); return page ? page_folio(page) : NULL; } #else static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, int nid, nodemask_t *node) { return NULL; } #endif /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) #endif /* __LINUX_GFP_H */ |
| 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SECCOMP_H #define _LINUX_SECCOMP_H #include <uapi/linux/seccomp.h> #include <linux/seccomp_types.h> #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ SECCOMP_FILTER_FLAG_LOG | \ SECCOMP_FILTER_FLAG_SPEC_ALLOW | \ SECCOMP_FILTER_FLAG_NEW_LISTENER | \ SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \ SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) /* sizeof() the first published struct seccomp_notif_addfd */ #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 #define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 #ifdef CONFIG_SECCOMP #include <linux/thread_info.h> #include <linux/atomic.h> #include <asm/seccomp.h> extern int __secure_computing(void); #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) return __secure_computing(); return 0; } #else extern void secure_computing_strict(int this_syscall); #endif extern long prctl_get_seccomp(void); extern long prctl_set_seccomp(unsigned long, void __user *); static inline int seccomp_mode(struct seccomp *s) { return s->mode; } #else /* CONFIG_SECCOMP */ #include <linux/errno.h> struct seccomp_data; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif static inline int __secure_computing(void) { return 0; } static inline long prctl_get_seccomp(void) { return -EINVAL; } static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3) { return -EINVAL; } static inline int seccomp_mode(struct seccomp *s) { return SECCOMP_MODE_DISABLED; } #endif /* CONFIG_SECCOMP */ #ifdef CONFIG_SECCOMP_FILTER extern void seccomp_filter_release(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); #else /* CONFIG_SECCOMP_FILTER */ static inline void seccomp_filter_release(struct task_struct *tsk) { return; } static inline void get_seccomp_filter(struct task_struct *tsk) { return; } #endif /* CONFIG_SECCOMP_FILTER */ #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) extern long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, void __user *data); extern long seccomp_get_metadata(struct task_struct *task, unsigned long filter_off, void __user *data); #else static inline long seccomp_get_filter(struct task_struct *task, unsigned long n, void __user *data) { return -EINVAL; } static inline long seccomp_get_metadata(struct task_struct *task, unsigned long filter_off, void __user *data) { return -EINVAL; } #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ #ifdef CONFIG_SECCOMP_CACHE_DEBUG struct seq_file; struct pid_namespace; struct pid; int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); #endif #endif /* _LINUX_SECCOMP_H */ |
| 170 25 2 2 134 11 2 103 2 41 132 138 11 11 89 89 42 42 120 22 101 5 102 66 66 10 10 48 7 36 24 24 201 153 1 48 1 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> struct nft_limit { spinlock_t lock; u64 last; u64 tokens; }; struct nft_limit_priv { struct nft_limit *limit; u64 tokens_max; u64 rate; u64 nsecs; u32 burst; bool invert; }; static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) { u64 now, tokens; s64 delta; spin_lock_bh(&priv->limit->lock); now = ktime_get_ns(); tokens = priv->limit->tokens + now - priv->limit->last; if (tokens > priv->tokens_max) tokens = priv->tokens_max; priv->limit->last = now; delta = tokens - cost; if (delta >= 0) { priv->limit->tokens = delta; spin_unlock_bh(&priv->limit->lock); return priv->invert; } priv->limit->tokens = tokens; spin_unlock_bh(&priv->limit->lock); return !priv->invert; } /* Use same default as in iptables. */ #define NFT_LIMIT_PKT_BURST_DEFAULT 5 static int nft_limit_init(struct nft_limit_priv *priv, const struct nlattr * const tb[], bool pkts) { u64 unit, tokens, rate_with_burst; bool invert = false; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); if (priv->rate == 0) return -EINVAL; unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) return -EOVERFLOW; if (tb[NFTA_LIMIT_BURST]) priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); if (pkts && priv->burst == 0) priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) return -EOVERFLOW; if (pkts) { u64 tmp = div64_u64(priv->nsecs, priv->rate); if (check_mul_overflow(tmp, priv->burst, &tokens)) return -EOVERFLOW; } else { u64 tmp; /* The token bucket size limits the number of tokens can be * accumulated. tokens_max specifies the bucket size. * tokens_max = unit * (rate + burst) / rate. */ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) return -EOVERFLOW; tokens = div64_u64(tmp, priv->rate); } if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); if (flags & ~NFT_LIMIT_F_INV) return -EOPNOTSUPP; if (flags & NFT_LIMIT_F_INV) invert = true; } priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT); if (!priv->limit) return -ENOMEM; priv->limit->tokens = tokens; priv->tokens_max = priv->limit->tokens; priv->invert = invert; priv->limit->last = ktime_get_ns(); spin_lock_init(&priv->limit->lock); return 0; } static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv, enum nft_limit_type type) { u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0; u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC); if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate), NFTA_LIMIT_PAD) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), NFTA_LIMIT_PAD) || nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) || nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_limit_destroy(const struct nft_ctx *ctx, const struct nft_limit_priv *priv) { kfree(priv->limit); } static int nft_limit_clone(struct nft_limit_priv *priv_dst, const struct nft_limit_priv *priv_src, gfp_t gfp) { priv_dst->tokens_max = priv_src->tokens_max; priv_dst->rate = priv_src->rate; priv_dst->nsecs = priv_src->nsecs; priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); if (!priv_dst->limit) return -ENOMEM; spin_lock_init(&priv_dst->limit->lock); priv_dst->limit->tokens = priv_src->tokens_max; priv_dst->limit->last = ktime_get_ns(); return 0; } struct nft_limit_priv_pkts { struct nft_limit_priv limit; u64 cost; }; static void nft_limit_pkts_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, [NFTA_LIMIT_FLAGS] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); int err; err = nft_limit_init(&priv->limit, tb, true); if (err < 0) return err; priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate); return 0; } static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); nft_limit_destroy(ctx, &priv->limit); } static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); priv_dst->cost = priv_src->cost; return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); } static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .eval = nft_limit_pkts_eval, .init = nft_limit_pkts_init, .destroy = nft_limit_pkts_destroy, .clone = nft_limit_pkts_clone, .dump = nft_limit_pkts_dump, .reduce = NFT_REDUCE_READONLY, }; static void nft_limit_bytes_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv *priv = nft_expr_priv(expr); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) regs->verdict.code = NFT_BREAK; } static int nft_limit_bytes_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_init(priv, tb, false); } static int nft_limit_bytes_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_limit_priv *priv = nft_expr_priv(expr); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_limit_priv *priv = nft_expr_priv(expr); nft_limit_destroy(ctx, priv); } static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_limit_priv *priv_dst = nft_expr_priv(dst); struct nft_limit_priv *priv_src = nft_expr_priv(src); return nft_limit_clone(priv_dst, priv_src, gfp); } static const struct nft_expr_ops nft_limit_bytes_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)), .eval = nft_limit_bytes_eval, .init = nft_limit_bytes_init, .dump = nft_limit_bytes_dump, .clone = nft_limit_bytes_clone, .destroy = nft_limit_bytes_destroy, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops * nft_limit_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_LIMIT_TYPE] == NULL) return &nft_limit_pkts_ops; switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { case NFT_LIMIT_PKTS: return &nft_limit_pkts_ops; case NFT_LIMIT_PKT_BYTES: return &nft_limit_bytes_ops; } return ERR_PTR(-EOPNOTSUPP); } static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", .select_ops = nft_limit_select_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, .owner = THIS_MODULE, }; static void nft_limit_obj_pkts_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv_pkts *priv = nft_obj_data(obj); if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; } static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_limit_priv_pkts *priv = nft_obj_data(obj); int err; err = nft_limit_init(&priv->limit, tb, true); if (err < 0) return err; priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate); return 0; } static int nft_limit_obj_pkts_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_limit_priv_pkts *priv = nft_obj_data(obj); return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_limit_priv_pkts *priv = nft_obj_data(obj); nft_limit_destroy(ctx, &priv->limit); } static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_pkts_ops = { .type = &nft_limit_obj_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)), .init = nft_limit_obj_pkts_init, .destroy = nft_limit_obj_pkts_destroy, .eval = nft_limit_obj_pkts_eval, .dump = nft_limit_obj_pkts_dump, }; static void nft_limit_obj_bytes_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_limit_priv *priv = nft_obj_data(obj); u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); if (nft_limit_eval(priv, cost)) regs->verdict.code = NFT_BREAK; } static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_init(priv, tb, false); } static int nft_limit_obj_bytes_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_limit_priv *priv = nft_obj_data(obj); return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); } static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_limit_priv *priv = nft_obj_data(obj); nft_limit_destroy(ctx, priv); } static struct nft_object_type nft_limit_obj_type; static const struct nft_object_ops nft_limit_obj_bytes_ops = { .type = &nft_limit_obj_type, .size = sizeof(struct nft_limit_priv), .init = nft_limit_obj_bytes_init, .destroy = nft_limit_obj_bytes_destroy, .eval = nft_limit_obj_bytes_eval, .dump = nft_limit_obj_bytes_dump, }; static const struct nft_object_ops * nft_limit_obj_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (!tb[NFTA_LIMIT_TYPE]) return &nft_limit_obj_pkts_ops; switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { case NFT_LIMIT_PKTS: return &nft_limit_obj_pkts_ops; case NFT_LIMIT_PKT_BYTES: return &nft_limit_obj_bytes_ops; } return ERR_PTR(-EOPNOTSUPP); } static struct nft_object_type nft_limit_obj_type __read_mostly = { .select_ops = nft_limit_obj_select_ops, .type = NFT_OBJECT_LIMIT, .maxattr = NFTA_LIMIT_MAX, .policy = nft_limit_policy, .owner = THIS_MODULE, }; static int __init nft_limit_module_init(void) { int err; err = nft_register_obj(&nft_limit_obj_type); if (err < 0) return err; err = nft_register_expr(&nft_limit_type); if (err < 0) goto err1; return 0; err1: nft_unregister_obj(&nft_limit_obj_type); return err; } static void __exit nft_limit_module_exit(void) { nft_unregister_expr(&nft_limit_type); nft_unregister_obj(&nft_limit_obj_type); } module_init(nft_limit_module_init); module_exit(nft_limit_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("limit"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT); MODULE_DESCRIPTION("nftables limit expression support"); |
| 23 23 23 23 23 23 23 65 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 | /* * Copyright (c) 2014 Chelsio, Inc. All rights reserved. * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "iwpm_util.h" #define IWPM_MAPINFO_HASH_SIZE 512 #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) #define IWPM_REMINFO_HASH_SIZE 64 #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) #define IWPM_MSG_SIZE 512 static LIST_HEAD(iwpm_nlmsg_req_list); static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); static struct hlist_head *iwpm_hash_bucket; static DEFINE_SPINLOCK(iwpm_mapinfo_lock); static struct hlist_head *iwpm_reminfo_bucket; static DEFINE_SPINLOCK(iwpm_reminfo_lock); static struct iwpm_admin_data iwpm_admin; /** * iwpm_init - Allocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes up. */ int iwpm_init(u8 nl_client) { iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_hash_bucket) return -ENOMEM; iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_reminfo_bucket) { kfree(iwpm_hash_bucket); return -ENOMEM; } iwpm_set_registration(nl_client, IWPM_REG_UNDEF); pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); return 0; } static void free_hash_bucket(void); static void free_reminfo_bucket(void); /** * iwpm_exit - Deallocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes down. */ int iwpm_exit(u8 nl_client) { free_hash_bucket(); free_reminfo_bucket(); pr_debug("%s: Resources are destroyed\n", __func__); iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); /** * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address * info in a hash table * @local_sockaddr: Local ip/tcp address * @mapped_sockaddr: Mapped local ip/tcp address * @nl_client: The index of the netlink client * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; unsigned long flags; int ret = -EINVAL; map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); if (!map_info) return -ENOMEM; memcpy(&map_info->local_sockaddr, local_sockaddr, sizeof(struct sockaddr_storage)); memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( &map_info->local_sockaddr, &map_info->mapped_sockaddr); if (hash_bucket_head) { hlist_add_head(&map_info->hlist_node, hash_bucket_head); ret = 0; } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); if (!hash_bucket_head) kfree(map_info); return ret; } /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address * info from the hash table * @local_sockaddr: Local ip/tcp address * @mapped_local_addr: Mapped local ip/tcp address * * Returns err code if mapping info is not found in the hash table, * otherwise returns 0 */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_mapping_info *map_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( local_sockaddr, mapped_local_addr); if (!hash_bucket_head) goto remove_mapinfo_exit; hlist_for_each_entry_safe(map_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, mapped_local_addr)) { hlist_del_init(&map_info->hlist_node); kfree(map_info); ret = 0; break; } } } remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } static void free_hash_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_mapping_info *map_info; unsigned long flags; int i; /* remove all the mapinfo data from the list */ spin_lock_irqsave(&iwpm_mapinfo_lock, flags); for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(map_info, tmp_hlist_node, &iwpm_hash_bucket[i], hlist_node) { hlist_del_init(&map_info->hlist_node); kfree(map_info); } } /* free the hash list */ kfree(iwpm_hash_bucket); iwpm_hash_bucket = NULL; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); } static void free_reminfo_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_remote_info *rem_info; unsigned long flags; int i; /* remove all the remote info from the list */ spin_lock_irqsave(&iwpm_reminfo_lock, flags); for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(rem_info, tmp_hlist_node, &iwpm_reminfo_bucket[i], hlist_node) { hlist_del_init(&rem_info->hlist_node); kfree(rem_info); } } /* free the hash list */ kfree(iwpm_reminfo_bucket); iwpm_reminfo_bucket = NULL; spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) { struct hlist_head *hash_bucket_head; unsigned long flags; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( &rem_info->mapped_loc_sockaddr, &rem_info->mapped_rem_sockaddr); if (hash_bucket_head) hlist_add_head(&rem_info->hlist_node, hash_bucket_head); } spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } /** * iwpm_get_remote_info - Get the remote connecting peer address info * * @mapped_loc_addr: Mapped local address of the listening peer * @mapped_rem_addr: Mapped remote address of the connecting peer * @remote_addr: To store the remote address of the connecting peer * @nl_client: The index of the netlink client * * The remote address info is retrieved and provided to the client in * the remote_addr. After that it is removed from the hash table */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, u8 nl_client) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_remote_info *rem_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( mapped_loc_addr, mapped_rem_addr); if (!hash_bucket_head) goto get_remote_info_exit; hlist_for_each_entry_safe(rem_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, mapped_loc_addr) && !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr, mapped_rem_addr)) { memcpy(remote_addr, &rem_info->remote_sockaddr, sizeof(struct sockaddr_storage)); iwpm_print_sockaddr(remote_addr, "get_remote_info: Remote sockaddr:"); hlist_del_init(&rem_info->hlist_node); kfree(rem_info); ret = 0; break; } } } get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); if (!nlmsg_request) return NULL; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); kref_init(&nlmsg_request->kref); kref_get(&nlmsg_request->kref); nlmsg_request->nlmsg_seq = nlmsg_seq; nlmsg_request->nl_client = nl_client; nlmsg_request->request_done = 0; nlmsg_request->err_code = 0; sema_init(&nlmsg_request->sem, 1); down(&nlmsg_request->sem); return nlmsg_request; } void iwpm_free_nlmsg_request(struct kref *kref) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_del_init(&nlmsg_request->inprocess_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); if (!nlmsg_request->request_done) pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", __func__, nlmsg_request->nlmsg_seq); kfree(nlmsg_request); } struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) { struct iwpm_nlmsg_request *nlmsg_request; struct iwpm_nlmsg_request *found_request = NULL; unsigned long flags; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, inprocess_list) { if (nlmsg_request->nlmsg_seq == echo_seq) { found_request = nlmsg_request; kref_get(&nlmsg_request->kref); break; } } spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); return found_request; } int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) { int ret; ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); if (ret) { ret = -EINVAL; pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); } else { ret = nlmsg_request->err_code; } kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); return ret; } int iwpm_get_nlmsg_seq(void) { return atomic_inc_return(&iwpm_admin.nlmsg_seq); } /* valid client */ u32 iwpm_get_registration(u8 nl_client) { return iwpm_admin.reg_list[nl_client]; } /* valid client */ void iwpm_set_registration(u8 nl_client, u32 reg) { iwpm_admin.reg_list[nl_client] = reg; } /* valid client */ u32 iwpm_check_registration(u8 nl_client, u32 reg) { return (iwpm_get_registration(nl_client) & reg); } int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr) { if (a_sockaddr->ss_family != b_sockaddr->ss_family) return 1; if (a_sockaddr->ss_family == AF_INET) { struct sockaddr_in *a4_sockaddr = (struct sockaddr_in *)a_sockaddr; struct sockaddr_in *b4_sockaddr = (struct sockaddr_in *)b_sockaddr; if (!memcmp(&a4_sockaddr->sin_addr, &b4_sockaddr->sin_addr, sizeof(struct in_addr)) && a4_sockaddr->sin_port == b4_sockaddr->sin_port) return 0; } else if (a_sockaddr->ss_family == AF_INET6) { struct sockaddr_in6 *a6_sockaddr = (struct sockaddr_in6 *)a_sockaddr; struct sockaddr_in6 *b6_sockaddr = (struct sockaddr_in6 *)b_sockaddr; if (!memcmp(&a6_sockaddr->sin6_addr, &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) return 0; } else { pr_err("%s: Invalid sockaddr family\n", __func__); } return 1; } struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, int nl_client) { struct sk_buff *skb = NULL; skb = dev_alloc_skb(IWPM_MSG_SIZE); if (!skb) goto create_nlmsg_exit; if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, NLM_F_REQUEST))) { pr_warn("%s: Unable to put the nlmsg header\n", __func__); dev_kfree_skb(skb); skb = NULL; } create_nlmsg_exit: return skb; } int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, const struct nla_policy *nlmsg_policy, struct nlattr *nltb[], const char *msg_type) { int nlh_len = 0; int ret; const char *err_str = ""; ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Invalid attribute"; goto parse_nlmsg_error; } ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Unable to parse the nlmsg"; goto parse_nlmsg_error; } ret = iwpm_validate_nlmsg_attr(nltb, policy_max); if (ret) { err_str = "Invalid NULL attribute"; goto parse_nlmsg_error; } return 0; parse_nlmsg_error: pr_warn("%s: %s (msg type %s ret = %d)\n", __func__, err_str, msg_type, ret); return ret; } void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) { struct sockaddr_in6 *sockaddr_v6; struct sockaddr_in *sockaddr_v4; switch (sockaddr->ss_family) { case AF_INET: sockaddr_v4 = (struct sockaddr_in *)sockaddr; pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", msg, &sockaddr_v4->sin_addr, ntohs(sockaddr_v4->sin_port), ntohs(sockaddr_v4->sin_port)); break; case AF_INET6: sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", msg, &sockaddr_v6->sin6_addr, ntohs(sockaddr_v6->sin6_port), ntohs(sockaddr_v6->sin6_port)); break; default: break; } } static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) { u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); return hash; } static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) { u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); return hash; } static int get_hash_bucket(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr, u32 *hash) { u32 a_hash, b_hash; if (a_sockaddr->ss_family == AF_INET) { a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr); b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr); } else if (a_sockaddr->ss_family == AF_INET6) { a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr); b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr); } else { pr_err("%s: Invalid sockaddr family\n", __func__); return -EINVAL; } if (a_hash == b_hash) /* if port mapper isn't available */ *hash = a_hash; else *hash = jhash_2words(a_hash, b_hash, 0); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash); if (ret) return NULL; return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK]; } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *mapped_loc_sockaddr, struct sockaddr_storage *mapped_rem_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash); if (ret) return NULL; return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK]; } static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto mapinfo_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); msg_seq = 0; err_str = "Unable to put attribute of mapinfo number nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); if (ret) goto mapinfo_num_error; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto mapinfo_num_error; } pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); return 0; mapinfo_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; } static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) { struct nlmsghdr *nlh = NULL; int ret = 0; if (!skb) return ret; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; } int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) { struct iwpm_mapping_info *map_info; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int skb_num = 0, mapping_num = 0; int i = 0, nlmsg_bytes = 0; unsigned long flags; const char *err_str = ""; int ret; skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { if (map_info->nl_client != nl_client) continue; nlh = NULL; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { ret = -ENOMEM; err_str = "Unable to put the nlmsg header"; goto send_mapping_info_unlock; } err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->local_sockaddr, IWPM_NLA_MAPINFO_LOCAL_ADDR); if (ret) goto send_mapping_info_unlock; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->mapped_sockaddr, IWPM_NLA_MAPINFO_MAPPED_ADDR); if (ret) goto send_mapping_info_unlock; if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { ret = ibnl_put_attr(skb, nlh, sizeof(u32), &map_info->map_flags, IWPM_NLA_MAPINFO_FLAGS); if (ret) goto send_mapping_info_unlock; } nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, "send_mapping_info: Mapped local sockaddr:"); mapping_num++; nlmsg_bytes += nlh->nlmsg_len; /* check if all mappings can fit in one skb */ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { /* and leave room for NLMSG_DONE */ nlmsg_bytes = 0; skb_num++; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); /* send the skb */ ret = send_nlmsg_done(skb, nl_client, iwpm_pid); skb = NULL; if (ret) { err_str = "Unable to send map info"; goto send_mapping_info_exit; } if (skb_num == IWPM_MAPINFO_SKB_COUNT) { ret = -ENOMEM; err_str = "Insufficient skbs for map info"; goto send_mapping_info_exit; } skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } spin_lock_irqsave(&iwpm_mapinfo_lock, flags); } } } send_mapping_info_unlock: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); send_mapping_info_exit: if (ret) { pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); dev_kfree_skb(skb); return ret; } send_nlmsg_done(skb, nl_client, iwpm_pid); return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); } int iwpm_mapinfo_available(void) { unsigned long flags; int full_bucket = 0, i = 0; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { if (!hlist_empty(&iwpm_hash_bucket[i])) { full_bucket = 1; break; } } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; const char *err_str; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto hello_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); err_str = "Unable to put attribute of abi_version into nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, IWPM_NLA_HELLO_ABI_VERSION); if (ret) goto hello_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto hello_num_error; } pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); return 0; hello_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; } |
| 286 51 1 20 6 13 52 115 5 1 104 2342 3 3 2 2 1 1 1 1 53 68 59 59 59 97 123 1966 208 208 208 2 35 35 207 156 99 270 59 68 68 59 59 2 6 16 51 58 59 59 28 39 59 2 2 38 68 68 52 36 36 36 68 68 9 8 8 67 66 138 138 138 110 74 39 3 354 75 74 1 1 36 36 36 2 112 278 10 2 89 4 2 2 9 1 4 5 2185 2188 1 33 6 21 1244 1966 1964 1422 1753 2 2 3 1502 353 2163 23 2186 267 58 209 1 12 265 12 2 19 19 19 19 19 54 6 158 162 54 231 49 231 226 9 267 272 270 271 231 49 229 49 272 266 9 272 230 49 270 271 138 147 272 271 159 122 123 123 14 109 80 2 15 15 54 80 20 20 20 4 12 16 98 98 95 43 1 54 89 88 35 3 3 3 16 12 26 26 97 39 10 1772 1773 98 98 47 53 48 70 69 138 117 98 47 104 10 103 28 13 103 103 99 99 35 29 61 51 2 51 3 11 50 3 50 3 51 2 50 56 20 112 1 112 1 111 75 44 3 41 324 17 319 338 7 18 2 3 329 6 263 4 59 321 89 273 56 9 51 55 322 2 2 270 1 2 3 1 80 321 56 336 340 339 7 3 1 1 204 54 11 139 123 55 55 4 131 94 74 94 65 44 1 18 61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 | // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <linux/capability.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. * We need a spinlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * The epnested_mutex is acquired when inserting an epoll fd onto another * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* * Protected by file->f_lock, true for to-be-released epitem already * removed from the "struct file" items list; together with * eventpoll->refcount orchestrates "struct eventpoll" disposal */ bool dying; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; u8 loop_check_depth; /* * usage count, used together with epitem->dying to * orchestrate the disposal of this struct */ refcount_t refcount; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; /* busy poll timeout */ u32 busy_poll_usecs; /* busy poll packet budget */ u16 busy_poll_budget; bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* Used for cycles detection */ static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, }; static void __init epoll_sysctls_init(void) { register_sysctl("fs/epoll", epoll_table); } #else #define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL /** * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value * from the epoll instance ep is preferred, but if it is not set fallback to * the system-wide global via busy_loop_timeout. * * @start_time: The start time used to compute the remaining time until timeout. * @ep: Pointer to the eventpoll context. * * Return: true if the timeout has expired, false otherwise. */ static bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep) { unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } else { return busy_loop_timeout(start_time); } } static bool ep_busy_loop_on(struct eventpoll *ep) { return !!READ_ONCE(ep->busy_poll_usecs) || READ_ONCE(ep->prefer_busy_poll) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (!budget) budget = BUSY_POLL_BUDGET; if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ if (prefer_busy_poll) napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct eventpoll *ep = file->private_data; void __user *uarg = (void __user *)arg; struct epoll_params epoll_params; switch (cmd) { case EPIOCSPARAMS: if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) return -EFAULT; /* pad byte must be zero */ if (epoll_params.__pad) return -EINVAL; if (epoll_params.busy_poll_usecs > S32_MAX) return -EINVAL; if (epoll_params.prefer_busy_poll > 1) return -EINVAL; if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && !capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); return 0; case EPIOCGPARAMS: memset(&epoll_params, 0, sizeof(epoll_params)); epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) return -EFAULT; return 0; default: return -ENOIOCTLCMD; } } static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } static void ep_suspend_napi_irqs(struct eventpoll *ep) { } static void ep_resume_napi_irqs(struct eventpoll *ep) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); } /* * Returns true if the event poll can be disposed */ static bool ep_refcount_dec_and_test(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); return true; } static void ep_free(struct eventpoll *ep) { ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. * If the dying flag is set, do the removal only if force is true. * This prevents ep_clear_and_put() from dropping all the ep references * while running concurrently with eventpoll_release_file(). * Returns true if the eventpoll can be disposed. */ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); if (epi->dying && !force) { spin_unlock(&file->f_lock); return false; } to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return true; } /* * ep_remove variant for callers owing an additional reference to the ep */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { if (__ep_remove(ep, epi, false)) WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree and try to free each "struct epitem". * Note that ep_remove_safe() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't * dispose it. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove_safe(ep, epi); cond_resched(); } mutex_unlock(&ep->mtx); if (ep_refcount_dec_and_test(ep)) ep_free(ep); } static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; if (!is_file_epoll(file)) return -EINVAL; switch (cmd) { case EPIOCSPARAMS: case EPIOCGPARAMS: ret = ep_eventpoll_bp_ioctl(file, cmd, arg); break; default: ret = -EINVAL; break; } return ret; } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_clear_and_put(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * The ffd.file pointer may be in the process of being torn down due to * being closed, but we may not have finished eventpoll_release() yet. * * Normally, even with the atomic_long_inc_not_zero, the file may have * been free'd and then gotten re-allocated to something else (since * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). * * But for epoll, users hold the ep->mtx mutex, and as such any file in * the process of being free'd will block in eventpoll_release_file() * and thus the underlying file allocation will not be free'd, and the * file re-use cannot happen. * * For the same reason we can avoid a rcu_read_lock() around the * operation - 'ffd.file' cannot go away even if the refcount has * reached zero (but we must still not call out to ->poll() functions * etc). */ static struct file *epi_fget(const struct epitem *epi) { struct file *file; file = epi->ffd.file; if (!file_ref_get(&file->f_ref)) file = NULL; return file; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi_fget(epi); __poll_t res; /* * We could return EPOLLERR | EPOLLHUP or something, but let's * treat this more as "file doesn't exist, poll didn't happen". */ if (!file) return 0; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); fput(file); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, .unlocked_ioctl = ep_eventpoll_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from * touching the epitems list before eventpoll_release_file() can access * the ep->mtx. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); epi->dying = true; spin_unlock(&file->f_lock); /* * ep access is safe as we still own a reference to the ep * struct */ ep = epi->ep; mutex_lock(&ep->mtx); dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) return -ENOMEM; mutex_init(&ep->mtx); spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = READ_ONCE(ep->ovflist); WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } if (sync) wake_up_sync(&ep->wq); else wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() * takes whead->lock held by the caller. Once we nullify it, * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* * ep_remove_safe() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove_safe(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove_safe(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } static int ep_try_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { int res; /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ res = ep_send_events(ep, events, maxevents); if (res > 0) ep_suspend_napi_irqs(ep); return res; } static int ep_schedule_timeout(ktime_t *to) { if (to) return ktime_after(*to, ktime_get()); else return 1; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { res = ep_try_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); spin_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file @ep inside another * epoll file does not create closed loops, and * determine the depth of the subtree starting at @ep * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int result = 0; struct rb_node *rbp; struct epitem *epi; if (ep->gen == loop_check_gen) return ep->loop_check_depth; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) result = INT_MAX; else result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); if (result > EP_MAX_NESTS) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } ep->loop_check_depth = result; mutex_unlock(&ep->mtx); return result; } /* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */ static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) { int result = 0; struct epitem *epi; if (ep->gen == loop_check_gen) return ep->loop_check_depth; hlist_for_each_entry_rcu(epi, &ep->refs, fllink) result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); ep->gen = loop_check_gen; ep->loop_check_depth = result; return result; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { int depth, upwards_depth; inserting_into = ep; /* * Check how deep down we can get from @to, and whether it is possible * to loop up to @ep. */ depth = ep_loop_check_proc(to, 0); if (depth > EP_MAX_NESTS) return -1; /* Check how far up we can go from @ep. */ rcu_read_lock(); upwards_depth = ep_get_upwards_depth_proc(ep, 0); rcu_read_unlock(); return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_clear_and_put(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; /* Get the "struct file *" for the target file */ CLASS(fd, tf)(fd); if (fd_empty(tf)) return -EBADF; /* The target file descriptor must support poll */ if (!file_can_poll(fd_file(tf))) return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(fd_file(tf))) { tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) { /* * The eventpoll itself is still alive: the refcount * can't go to zero here. */ ep_remove_safe(ep, epi); error = 0; } else { error = -ENOENT; } break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epnested_mutex); } return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } static int ep_check_params(struct file *file, struct epoll_event __user *evs, int maxevents) { /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ if (!is_file_epoll(file)) return -EINVAL; return 0; } int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents) { struct eventpoll *ep; int ret; ret = ep_check_params(file, events, maxevents); if (unlikely(ret)) return ret; ep = file->private_data; /* * Racy call, but that's ok - it should get retried based on * poll readiness anyway. */ if (ep_events_available(ep)) return ep_try_send_events(ep, events, maxevents); return 0; } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; ret = ep_check_params(fd_file(f), events, maxevents); if (unlikely(ret)) return ret; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* Time to fish for events ... */ return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init); |
| 5685 5685 3035 3035 149 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/dir.c - sysfs core and dir operation implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt #include <linux/fs.h> #include <linux/kobject.h> #include <linux/slab.h> #include "sysfs.h" DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) kernfs_path(parent, buf, PATH_MAX); pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name); dump_stack(); kfree(buf); } /** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { struct kernfs_node *parent, *kn; kuid_t uid; kgid_t gid; if (WARN_ON(!kobj)) return -EINVAL; if (kobj->parent) parent = kobj->parent->sd; else parent = sysfs_root_kn; if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn); } kobj->sd = kn; return 0; } /** * sysfs_remove_dir - remove an object's directory. * @kobj: object. * * The only thing special about this is that we remove any files in * the directory before we remove the directory, and we've inlined * what used to be sysfs_rmdir() below, instead of calling separately. */ void sysfs_remove_dir(struct kobject *kobj) { struct kernfs_node *kn = kobj->sd; /* * In general, kobject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no * control over removal. @kobj->sd may be removed anytime * and symlink code may end up dereferencing an already freed node. * * sysfs_symlink_target_lock synchronizes @kobj->sd * disassociation against symlink operations so that symlink code * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); if (kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn); } } int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { struct kernfs_node *parent; int ret; parent = kernfs_get_parent(kobj->sd); ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); kernfs_put(parent); return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; return kernfs_rename_ns(kn, new_parent, NULL, new_ns); } /** * sysfs_create_mount_point - create an always empty directory * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to add */ int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *kn, *parent = parent_kobj->sd; kn = kernfs_create_empty_dir(parent, name); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, name); return PTR_ERR(kn); } return 0; } EXPORT_SYMBOL_GPL(sysfs_create_mount_point); /** * sysfs_remove_mount_point - remove an always empty directory. * @parent_kobj: kobject that will contain this always empty directory * @name: The name of the always empty directory to remove * */ void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { struct kernfs_node *parent = parent_kobj->sd; kernfs_remove_by_name_ns(parent, name, NULL); } EXPORT_SYMBOL_GPL(sysfs_remove_mount_point); |
| 557 949 694 313 95 78 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_H #define _CRYPTO_INTERNAL_H #include <crypto/algapi.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/jump_label.h> #include <linux/list.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/numa.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/scatterlist.h> #include <linux/sched.h> #include <linux/types.h> struct crypto_instance; struct crypto_template; struct crypto_larval { struct crypto_alg alg; struct crypto_alg *adult; struct completion completion; u32 mask; bool test_started; }; struct crypto_type { unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask); unsigned int (*extsize)(struct crypto_alg *alg); int (*init_tfm)(struct crypto_tfm *tfm); void (*show)(struct seq_file *m, struct crypto_alg *alg); int (*report)(struct sk_buff *skb, struct crypto_alg *alg); void (*free)(struct crypto_instance *inst); void (*destroy)(struct crypto_alg *alg); unsigned int type; unsigned int maskclear; unsigned int maskset; unsigned int tfmsize; unsigned int algsize; }; enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, CRYPTOA_TYPE, __CRYPTOA_MAX, }; #define CRYPTOA_MAX (__CRYPTOA_MAX - 1) /* Maximum number of (rtattr) parameters for each template. */ #define CRYPTO_MAX_ATTRS 32 extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; int alg_test(const char *driver, const char *alg, u32 type, u32 mask); #if !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || !IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) static inline bool crypto_boot_test_finished(void) { return true; } static inline void set_crypto_boot_test_finished(void) { } #else DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished); static inline bool crypto_boot_test_finished(void) { return static_branch_likely(&__crypto_boot_test_finished); } static inline void set_crypto_boot_test_finished(void) { static_branch_enable(&__crypto_boot_test_finished); } #endif /* !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || * !IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) */ #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); void __exit crypto_exit_proc(void); #else static inline void crypto_init_proc(void) { } static inline void crypto_exit_proc(void) { } #endif static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } struct crypto_alg *crypto_mod_get(struct crypto_alg *alg); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_schedule_test(struct crypto_larval *larval); void crypto_alg_tested(const char *name, int err); void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg); void crypto_remove_final(struct list_head *list); void crypto_shoot_alg(struct crypto_alg *alg); struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type, u32 mask, gfp_t gfp); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask); void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node); void *crypto_clone_tfm(const struct crypto_type *frontend, struct crypto_tfm *otfm); static inline void *crypto_create_tfm(struct crypto_alg *alg, const struct crypto_type *frontend) { return crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE); } struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask); void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node); static inline void *crypto_alloc_tfm(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE); } int crypto_probing_notify(unsigned long val, void *v); unsigned int crypto_alg_extsize(struct crypto_alg *alg); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask); static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) { refcount_inc(&alg->cra_refcnt); return alg; } void crypto_destroy_alg(struct crypto_alg *alg); static inline void crypto_alg_put(struct crypto_alg *alg) { if (refcount_dec_and_test(&alg->cra_refcnt)) crypto_destroy_alg(alg); } static inline int crypto_tmpl_get(struct crypto_template *tmpl) { return try_module_get(tmpl->module); } static inline void crypto_tmpl_put(struct crypto_template *tmpl) { module_put(tmpl->module); } static inline int crypto_is_larval(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_LARVAL; } static inline int crypto_is_dead(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_DEAD; } static inline int crypto_is_moribund(struct crypto_alg *alg) { return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING); } static inline void crypto_notify(unsigned long val, void *v) { blocking_notifier_call_chain(&crypto_chain, val, v); } static inline void crypto_yield(u32 flags) { if (flags & CRYPTO_TFM_REQ_MAY_SLEEP) cond_resched(); } static inline int crypto_is_test_larval(struct crypto_larval *larval) { return larval->alg.cra_driver_name[0]; } static inline struct crypto_tfm *crypto_tfm_get(struct crypto_tfm *tfm) { return refcount_inc_not_zero(&tfm->refcnt) ? tfm : ERR_PTR(-EOVERFLOW); } #endif /* _CRYPTO_INTERNAL_H */ |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for the IPPROTO_SMC (socket related) * * Copyright IBM Corp. 2016, 2018 * Copyright (c) 2024, Alibaba Inc. * * Author: D. Wythe <alibuda@linux.alibaba.com> */ #include <net/protocol.h> #include <net/sock.h> #include "smc_inet.h" #include "smc.h" static int smc_inet_init_sock(struct sock *sk); static struct proto smc_inet_prot = { .name = "INET_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; static const struct proto_ops smc_inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet_prot, .ops = &smc_inet_stream_ops, }; #if IS_ENABLED(CONFIG_IPV6) struct smc6_sock { struct smc_sock smc; struct ipv6_pinfo inet6; }; static struct proto smc_inet6_prot = { .name = "INET6_SMC", .owner = THIS_MODULE, .init = smc_inet_init_sock, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc6_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), }; static const struct proto_ops smc_inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static struct inet_protosw smc_inet6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SMC, .prot = &smc_inet6_prot, .ops = &smc_inet6_stream_ops, }; #endif /* CONFIG_IPV6 */ static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } int __init smc_inet_init(void) { int rc; rc = proto_register(&smc_inet_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet_prot fails with %d\n", __func__, rc); return rc; } /* no return value */ inet_register_protosw(&smc_inet_protosw); #if IS_ENABLED(CONFIG_IPV6) rc = proto_register(&smc_inet6_prot, 1); if (rc) { pr_err("%s: proto_register smc_inet6_prot fails with %d\n", __func__, rc); goto out_inet6_prot; } rc = inet6_register_protosw(&smc_inet6_protosw); if (rc) { pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", __func__, rc); goto out_inet6_protosw; } return rc; out_inet6_protosw: proto_unregister(&smc_inet6_prot); out_inet6_prot: inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); #endif /* CONFIG_IPV6 */ return rc; } void smc_inet_exit(void) { #if IS_ENABLED(CONFIG_IPV6) inet6_unregister_protosw(&smc_inet6_protosw); proto_unregister(&smc_inet6_prot); #endif /* CONFIG_IPV6 */ inet_unregister_protosw(&smc_inet_protosw); proto_unregister(&smc_inet_prot); } |
| 168 168 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Do sleep inside a spin-lock * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de> */ #include <linux/export.h> #include <sound/core.h> #include "seq_lock.h" /* wait until all locks are released */ void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line) { int warn_count = 5 * HZ; if (atomic_read(lockp) < 0) { pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n", atomic_read(lockp), file, line); return; } while (atomic_read(lockp) > 0) { if (warn_count-- == 0) pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n", atomic_read(lockp), file, line); schedule_timeout_uninterruptible(1); } } EXPORT_SYMBOL(snd_use_lock_sync_helper); |
| 217 3 6 216 216 84 84 84 7 7 84 7 84 7 221 87 216 87 216 97 97 97 3 3 3 3 3 3 125 85 85 3 25 148 82 119 11 38 2 2 2 10 2 12 12 12 98 12 12 79 6 9 7 8 15 11 47 45 86 2 11 89 94 95 95 95 14 2 1 31 3 3 5 1 1 27 1 25 6 12 14 13 14 2 10 3 58 58 46 46 10 4 4 10 7 6 9 6 6 11 13 14 2 1 11 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7 7 7 3 3 71 2 3 3 4 4 3 1 74 1 74 2 74 74 4 5 70 73 8 4 4 8 60 58 4 60 60 1 2 2 3 9 7 3 60 57 10 2 59 8 60 60 60 8 6 61 61 9 9 1 60 85 16 21 1 61 7 76 7 1 2 82 62 82 60 2178 8 4 108 9 10 10 9 5 10 10 7 3 10 3 7 10 10 51 3 42 34 53 2 7 48 98 7 27 80 51 55 92 41 8 85 77 38 37 10 105 41 98 13 5 4 1 2 10 1 3 2 3 1 6 6 3 3 2 132 35 3 84 80 137 4 2 6 1 17 1 19 12 1 12 9 10 19 2 2 4 81 39 2 1 8 1 3 25 1 1 10 24 1 32 1 15 87 21 4 8 19 176 2 6 168 20 119 2 120 111 111 13 13 3 5 1 17 1 4 7 4 4 14 2 5 7 1 1 3 1 2 2 1 1 18 4 2 5 14 5 2 6 15 15 1 7 15 15 7 7 4 3 23 23 5 5 3 2 16 6 2 1 7 21 11 5 5 14 1 29 29 3 2 2 1 6 8 1 16 16 417 1 44 25 4686 4565 2032 955 240 422 54 723 723 125 723 723 154 153 154 217 107 217 10 10 10 9 9 9 9 573 572 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 | // SPDX-License-Identifier: GPL-2.0 /* Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/nexthop.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/lwtunnel.h> #include <net/ndisc.h> #include <net/nexthop.h> #include <net/route.h> #include <net/sock.h> #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) #define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \ NHA_OP_FLAG_DUMP_HW_STATS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true), }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_policy_del[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->id = nhi->nh_parent->id; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; info->nh_grp->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. */ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nh->id, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); free_percpu(nhge->stats); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); spin_lock_init(&nh->lock); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) { struct nh_grp_entry_stats *cpu_stats; cpu_stats = get_cpu_ptr(nhge->stats); u64_stats_update_begin(&cpu_stats->syncp); u64_stats_inc(&cpu_stats->packets); u64_stats_update_end(&cpu_stats->syncp); put_cpu_ptr(cpu_stats); } static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, u64 *ret_packets) { int i; *ret_packets = 0; for_each_possible_cpu(i) { struct nh_grp_entry_stats *cpu_stats; unsigned int start; u64 packets; cpu_stats = per_cpu_ptr(nhge->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); packets = u64_stats_read(&cpu_stats->packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); *ret_packets += packets; } } static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, stats, nhg->num_nh), GFP_KERNEL); if (!info->nh_grp_hw_stats) return -ENOMEM; info->nh_grp_hw_stats->num_nh = nhg->num_nh; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; } return 0; } static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) { kfree(info->nh_grp_hw_stats); } void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 delta_packets) { info->hw_stats_used = true; info->stats[nh_idx].packets += delta_packets; } EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); static void nh_grp_hw_stats_apply_update(struct nexthop *nh, struct nh_notifier_info *info) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; } } static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) { struct nh_notifier_info info = { .net = nh->net, }; struct net *net = nh->net; int err; if (nexthop_notifiers_is_empty(net)) { *hw_stats_used = false; return 0; } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, &info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ nh_grp_hw_stats_apply_update(nh, &info); *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; nh_notifier_grp_hw_stats_fini(&info); return notifier_to_errno(err); } static int nla_put_nh_group_stats_entry(struct sk_buff *skb, struct nh_grp_entry *nhge, u32 op_flags) { struct nlattr *nest; u64 packets; nh_grp_entry_stats_read(nhge, &packets); nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets + nhge->packets_hw)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, nhge->packets_hw)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nlattr *nest; bool hw_stats_used; int err; int i; if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) goto err_out; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nhg->hw_stats) { err = nh_grp_hw_stats_update(nh, &hw_stats_used); if (err) goto out; if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) goto err_out; } nest = nla_nest_start(skb, NHA_GROUP_STATS); if (!nest) goto err_out; for (i = 0; i < nhg->num_nh; i++) if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], op_flags)) goto cancel_out; nla_nest_end(skb, nest); return 0; cancel_out: nla_nest_cancel(skb, nest); err_out: err = -EMSGSIZE; out: return err; } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, u32 op_flags, u32 *resp_op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; u16 weight; int i; *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { weight = nhg->nh_entries[i].weight - 1; *p++ = (struct nexthop_grp) { .id = nhg->nh_entries[i].nh->id, .weight = weight, .weight_high = weight >> 8, }; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || nla_put_nh_group_stats(skb, nh, op_flags))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags, u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); u32 resp_op_flags = 0; if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) || nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh) + nla_total_size(4) + /* NHA_OP_FLAGS */ 0; else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); struct nexthop_grp *nhg; unsigned int i, j; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL; } if (nexthop_grp_weight(&nhg[i]) == 0) { /* 0xffff got passed in, representing weight of 0x10000, * which is too heavy. */ NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } nhg = nla_data(tb[NHA_GROUP]); for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int len; unsigned int i; u8 nhg_fdb; len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); nhg_fdb = !!tb[NHA_FDB]; for (i = 0; i < len; i++) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!nhge0) nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } if (!nhge0) nhge0 = &nhg->nh_entries[0]; nh_grp_entry_stats_inc(nhge0); return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); nh_grp_entry_stats_inc(nhge); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rcu_dereference_rtnl(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { u16 prev_upper_bound = 0; u32 total = 0; u32 w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u16 upper_bound; u64 btw; w += nhge->weight; btw = ((u64)res_table->num_nh_buckets) * w; upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. */ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { u32 total = 0; u32 w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u32 upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. */ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; /* If there is nothing to do, let's avoid the costly call to * synchronize_net() */ if (list_empty(&nh->grp_list)) return; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); spin_lock_bh(&nh->lock); nh->dead = true; while (!list_empty(&nh->f6i_list)) { f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list); /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); spin_unlock_bh(&nh->lock); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); spin_lock_bh(&nh->lock); } spin_unlock_bh(&nh->lock); } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } if (!list_empty(&old->grp_list) && rtnl_dereference(new->nh_info)->fdb_nh != rtnl_dereference(old->nh_info)->fdb_nh) { NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. */ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].stats = netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); if (!nhg->nh_entries[i].stats) { err = -ENOMEM; nexthop_put(nhe); goto out_no_nh; } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]); list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; if (cfg->nh_hw_stats) nhg->hw_stats = true; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **tb, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); int err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); if (tb[NHA_HW_STATS_ENABLE]) cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } if (tb[NHA_HW_STATS_ENABLE]) { NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); goto out; } err = 0; out: return err; } static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb, struct nh_config *cfg, struct netlink_ext_ack *extack) { if (tb[NHA_GROUP]) return nh_check_attr_group_rtnl(net, tb, extack); if (tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); return -ENETDOWN; } if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); return -ENETDOWN; } } return 0; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) goto out; err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack); if (err) goto out; if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); err = -EINVAL; goto out; } rtnl_net_lock(net); err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack); if (err) goto unlock; nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); unlock: rtnl_net_unlock(net); out: return err; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, u32 *op_flags, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (op_flags) *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; rtnl_net_lock(net); nh = nexthop_find_by_id(net, id); if (nh) remove_nexthop(net, nh, &nlinfo); else err = -ENOENT; rtnl_net_unlock(net); return err; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; u32 op_flags; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; /* If this is not the first invocation, ctx->idx will contain the id of * the last nexthop we processed. Instead of starting from the very * first element of the red/black tree again and linearly skipping the * (potentially large) set of nodes with an id smaller than s_idx, walk * the tree and find the left-most node whose id is >= s_idx. This * provides an efficient O(log n) starting point for the dump * continuation. */ if (s_idx != 0) { struct rb_node *tmp = root->rb_node; node = NULL; while (tmp) { struct nexthop *nh; nh = rb_entry(tmp, struct nexthop, rb_node); if (nh->id < s_idx) { tmp = tmp->rb_right; } else { /* Track current candidate and keep looking on * the left side to find the left-most * (smallest id) that is still >= s_idx. */ node = tmp; tmp = tmp->rb_left; } } } else { node = rb_first(root); } for (; node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (!err) nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); return err; } EXPORT_SYMBOL(__unregister_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ASSERT_RTNL_NET(net); flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) { kfree(net->nexthop.devhash); net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, .exit_rtnl = nexthop_net_exit_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, .dumpit = rtm_dump_nexthop}, {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, .dumpit = rtm_dump_nexthop_bucket}, {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register_many(nexthop_rtnl_msg_handlers); return 0; } subsys_initcall(nexthop_init); |
| 25 1 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | // SPDX-License-Identifier: GPL-2.0-only /* * netfilter module to enforce network quotas * * Sam Johnston <samj@samj.net> */ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_quota.h> #include <linux/module.h> struct xt_quota_priv { spinlock_t lock; uint64_t quota; }; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); MODULE_DESCRIPTION("Xtables: countdown quota match"); MODULE_ALIAS("ipt_quota"); MODULE_ALIAS("ip6t_quota"); static bool quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct xt_quota_info *q = (void *)par->matchinfo; struct xt_quota_priv *priv = q->master; bool ret = q->flags & XT_QUOTA_INVERT; spin_lock_bh(&priv->lock); if (priv->quota >= skb->len) { priv->quota -= skb->len; ret = !ret; } else { /* we do not allow even small packets from now on */ priv->quota = 0; } spin_unlock_bh(&priv->lock); return ret; } static int quota_mt_check(const struct xt_mtchk_param *par) { struct xt_quota_info *q = par->matchinfo; if (q->flags & ~XT_QUOTA_MASK) return -EINVAL; q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); if (q->master == NULL) return -ENOMEM; spin_lock_init(&q->master->lock); q->master->quota = q->quota; return 0; } static void quota_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_quota_info *q = par->matchinfo; kfree(q->master); } static struct xt_match quota_mt_reg __read_mostly = { .name = "quota", .revision = 0, .family = NFPROTO_UNSPEC, .match = quota_mt, .checkentry = quota_mt_check, .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), .usersize = offsetof(struct xt_quota_info, master), .me = THIS_MODULE, }; static int __init quota_mt_init(void) { return xt_register_match("a_mt_reg); } static void __exit quota_mt_exit(void) { xt_unregister_match("a_mt_reg); } module_init(quota_mt_init); module_exit(quota_mt_exit); |
| 45 20 39 35 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GSO_H #define _NET_GSO_H #include <linux/skbuff.h> /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for * tunnel skb it points to outer mac header. * Keeps track of level of encapsulation of network headers. */ struct skb_gso_cb { union { int mac_offset; int data_offset; }; int encap_level; __wsum csum; __u16 csum_start; }; #define SKB_GSO_CB_OFFSET 32 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { return (skb_mac_header(inner_skb) - inner_skb->head) - SKB_GSO_CB(inner_skb)->mac_offset; } static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) { int new_headroom, headroom; int ret; headroom = skb_headroom(skb); ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); if (ret) return ret; new_headroom = skb_headroom(skb); SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); return 0; } static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) { /* Do not update partial checksums if remote checksum is enabled. */ if (skb->remcsum_offload) return; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; } /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). * skb->csum and csum_start are then updated to reflect the checksum of the * resultant packet starting from the transport header-- the resultant checksum * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo * header. */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { unsigned char *csum_start = skb_transport_header(skb); int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; __wsum partial = SKB_GSO_CB(skb)->csum; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; return csum_fold(csum_partial(csum_start, plen, partial)); } struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) { skb->protocol = protocol; skb->encapsulation = 1; skb_push(skb, pulled_hlen); skb_reset_transport_header(skb); skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } #endif /* _NET_GSO_H */ |
| 721 632 334 3447 2273 13 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_INETDEVICE_H #define _LINUX_INETDEVICE_H #ifdef __KERNEL__ #include <linux/bitmap.h> #include <linux/if.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <linux/timer.h> #include <linux/sysctl.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; DECLARE_BITMAP(state, IPV4_DEVCONF_MAX); }; #define MC_HASH_SZ_LOG 9 struct in_device { struct net_device *dev; netdevice_tracker dev_tracker; refcount_t refcnt; int dead; struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ struct ip_mc_list __rcu *mc_list; /* IP multicast filter chain */ struct ip_mc_list __rcu * __rcu *mc_hash; int mc_count; /* Number of installed mcasts */ spinlock_t mc_tomb_lock; struct ip_mc_list *mc_tomb; unsigned long mr_v1_seen; unsigned long mr_v2_seen; unsigned long mr_maxdelay; unsigned long mr_qi; /* Query Interval */ unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ struct neigh_parms *arp_parms; struct ipv4_devconf cnf; struct rcu_head rcu_head; }; #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1]) #define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr)) #define IPV4_DEVCONF_ALL(net, attr) \ IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr) #define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr)) static inline int ipv4_devconf_get(const struct in_device *in_dev, int index) { index--; return READ_ONCE(in_dev->cnf.data[index]); } static inline void ipv4_devconf_set(struct in_device *in_dev, int index, int val) { index--; set_bit(index, in_dev->cnf.state); WRITE_ONCE(in_dev->cnf.data[index], val); } static inline void ipv4_devconf_setall(struct in_device *in_dev) { bitmap_fill(in_dev->cnf.state, IPV4_DEVCONF_MAX); } #define IN_DEV_CONF_GET(in_dev, attr) \ ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr) #define IN_DEV_CONF_SET(in_dev, attr, val) \ ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val)) #define IN_DEV_ANDCONF(in_dev, attr) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_NET_ORCONF(in_dev, net, attr) \ (IPV4_DEVCONF_ALL_RO(net, attr) || \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_ORCONF(in_dev, attr) \ IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr) #define IN_DEV_MAXCONF(in_dev, attr) \ (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \ IN_DEV_CONF_GET((in_dev), attr))) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) #define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_ACCEPT_LOCAL(in_dev) IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL) #define IN_DEV_BOOTP_RELAY(in_dev) IN_DEV_ANDCONF((in_dev), BOOTP_RELAY) #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) #define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ SECURE_REDIRECTS) #define IN_DEV_IDTAG(in_dev) IN_DEV_CONF_GET(in_dev, TAG) #define IN_DEV_MEDIUM_ID(in_dev) IN_DEV_CONF_GET(in_dev, MEDIUM_ID) #define IN_DEV_PROMOTE_SECONDARIES(in_dev) \ IN_DEV_ORCONF((in_dev), \ PROMOTE_SECONDARIES) #define IN_DEV_ROUTE_LOCALNET(in_dev) IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET) #define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net) \ IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET) #define IN_DEV_RX_REDIRECTS(in_dev) \ ((IN_DEV_FORWARD(in_dev) && \ IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \ || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) #define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_IGNORE) #define IN_DEV_ARP_NOTIFY(in_dev) IN_DEV_MAXCONF((in_dev), ARP_NOTIFY) #define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \ ARP_EVICT_NOCARRIER) struct in_ifaddr { struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; __be32 ifa_local; __be32 ifa_address; __be32 ifa_mask; __u32 ifa_rt_priority; __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 ifa_valid_lft; __u32 ifa_preferred_lft; unsigned long ifa_cstamp; /* created timestamp */ unsigned long ifa_tstamp; /* updated timestamp */ }; struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); int unregister_inetaddr_notifier(struct notifier_block *nb); int register_inetaddr_validator_notifier(struct notifier_block *nb); int unregister_inetaddr_validator_notifier(struct notifier_block *nb); void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf); struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref); static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) { return __ip_dev_find(net, addr, true); } int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b); int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *); #ifdef CONFIG_INET int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size); #else static inline int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { return 0; } #endif void devinet_init(void); struct in_device *inetdev_by_index(struct net *, int); __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope); __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); } /* * Check if a mask is acceptable. */ static __inline__ bool bad_mask(__be32 mask, __be32 addr) { __u32 hmask; if (addr & (mask = ~mask)) return true; hmask = ntohl(mask); if (hmask & (hmask+1)) return true; return false; } #define in_dev_for_each_ifa_rtnl(ifa, in_dev) \ for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) #define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) \ for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa; \ ifa = rtnl_net_dereference(net, ifa->ifa_next)) #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->ip_ptr); } static inline struct in_device *in_dev_get(const struct net_device *dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) refcount_inc(&in_dev->refcnt); rcu_read_unlock(); return in_dev; } static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->ip_ptr); } static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip_ptr); } /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { struct in_device *in_dev; bool rc = false; in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rc = true; return rc; } static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); return in_dev ? in_dev->arp_parms : NULL; } void in_dev_finish_destroy(struct in_device *idev); static inline void in_dev_put(struct in_device *idev) { if (refcount_dec_and_test(&idev->refcnt)) in_dev_finish_destroy(idev); } #define __in_dev_put(idev) refcount_dec(&(idev)->refcnt) #define in_dev_hold(idev) refcount_inc(&(idev)->refcnt) #endif /* __KERNEL__ */ static __inline__ __be32 inet_make_mask(int logmask) { if (logmask) return htonl(~((1U<<(32-logmask))-1)); return 0; } static __inline__ int inet_mask_len(__be32 mask) { __u32 hmask = ntohl(mask); if (!hmask) return 0; return 32 - ffz(~hmask); } #endif /* _LINUX_INETDEVICE_H */ |
| 2502 1965 2304 459 2292 2142 2145 1387 2146 2146 1425 2316 768 952 2136 6 2131 1836 1833 2 649 67 1720 2 1631 1719 1686 104 1684 1601 1951 104 2080 1436 885 2029 1352 843 2783 1632 1185 2152 1364 788 1829 1211 615 1535 1041 494 577 267 311 992 627 364 652 427 225 91 53 38 137 53 84 184 1025 548 1053 1054 1054 1055 1055 1056 1056 1056 771 287 713 81 649 2 650 671 186 611 11 110 547 24 8 17 1 16 1 147 3 3 1 269 113 156 4 154 2010 10 2010 311 3 310 68 2 66 121 5 116 2 79 40 79 40 102 102 14 346 26 5 336 336 26 57 44 13 92 1491 1489 71 259 360 356 71 65 107 101 22 99 101 37 64 101 101 101 101 101 101 2265 2265 1324 1327 1326 1256 1257 784 611 63 1257 1257 34 30 2 2 34 1640 1942 1942 1943 1943 65 1 4 1 63 160 6 936 1837 129 6 1912 1877 1880 1880 48 9 10 74 44 99 46 1 9 444 71 1787 273 43 5 15 1 2 268 93 73 2 74 1 36 147 75 94 121 116 85 1 28 7 1 62 6 39 115 38 6 1 1069 753 939 92 936 753 1763 910 779 125 78 50 124 5 697 209 3 1794 35 1616 205 206 205 131 74 1503 1 211 33 8 1705 7 1705 150 1568 118 1593 1652 22 36 53 3 1 1827 1821 6 1820 1 1 4 1817 105 5 110 110 1820 1805 1805 1805 1803 1807 1807 1806 1806 1807 1807 1803 1806 12 2 2 14 2 15 1 11 11 16 16 16 16 11 11 11 16 15 2 10 6 15 1 16 16 14 2 14 2 14 2 16 6 6 16 16 16 16 16 15 15 14 2 16 16 16 1575 87 127 11 25 1336 1336 1373 1372 1 1373 1461 115 655 656 654 710 54 656 656 656 655 700 33 100 279 289 1135 1145 812 333 1656 1677 1676 1677 1669 7 1665 10 2 1673 1662 9 1616 55 2 1664 1663 3 1658 4 4 1659 1119 104 1 438 829 827 1657 1399 315 933 471 276 1657 1592 64 1593 1556 266 1738 3 1 223 14 3 1606 2 20 191 214 13 210 690 689 24 16 7 8 690 690 1305 1504 8 201 1306 1305 1117 690 1305 1116 690 1113 115 115 115 114 115 115 115 115 115 115 1225 1226 1227 1227 1227 1187 5 36 1561 544 1017 1342 763 360 1717 52 1552 219 1486 285 1580 190 1686 83 1493 149 128 1444 326 292 1478 246 1522 1770 1700 1 553 2 1148 1 2 3 119 1572 1658 1657 1658 60 2 1 3 643 1019 29 14 1 38 4 1770 4 1770 3 1684 1 83 13 417 1340 1746 1688 52 5 1712 2 29 2 1695 46 14 1680 46 11 2 1714 1712 1692 1 1691 1 2 1689 1681 6 1679 1677 2 1676 897 780 1 1677 1677 1675 1678 1 1677 64 64 61 270 1 114 155 29 240 1477 1589 1 2 1 1 1582 1778 2 1 1 1 1 1087 689 1 687 4 1702 1 1 1179 58 112 70 286 1714 1713 585 2 1128 2 2 4 1702 1520 53 1728 1781 1781 2 1774 1770 1769 1763 15 1746 32 10 2 9 14 21 1521 138 849 10 801 894 5 1468 64 188 2 6 1582 84 349 1150 1583 1119 462 1226 18 3 1474 87 205 1320 35 39 1488 35 1491 782 710 16 908 568 1420 1420 803 617 162 162 353 352 1419 1781 1771 8 1781 1420 464 1342 360 1786 96 25 1 71 7 49 5 1 6 6 58 6 2 56 56 6 6 1881 1876 115 1799 1882 1881 1882 1410 566 900 1057 535 985 1881 1878 1880 1881 1881 1877 4 320 34 289 114 85 207 289 3 12 12 12 12 97 9 105 6 99 95 7 2 103 2 105 9 96 19 4 5 38 43 3 27 27 1 26 6 20 71 16 27 47 49 2 2 21 60 6 42 24 66 55 11 36 6 41 1 42 108 5 105 42 9 9 8 9 9 1 9 444 444 5 443 1 563 565 5 410 208 446 445 1 3 1 2 4 1 1 2 3 657 14 646 706 685 675 657 655 92 173 54 119 163 56 1 5 859 3 856 8 176 841 599 146 580 577 153 3 12 10 7 2 3 474 374 101 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { if (simu_fail) { clear_buffer_uptodate(bh); unlock_buffer(bh); return; } /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } return ext4_read_bh(bh, op_flags, NULL, false); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL; return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct ext4_super_block *es) { int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_feature_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last * superblock update * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the * last superblock update. * * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); } static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, struct folio *folio) { struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. */ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != jinode->i_transaction || jh->b_next_transaction))) return true; } while ((bh = bh->b_this_page) != head); return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct folio *folio = NULL; int error; /* * writeback_iter() already checks for dirty pages and calls * folio_clear_dirty_for_io(), which we want to write protect the * folios. * * However, we may have to redirty a folio sometimes. */ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { if (ext4_journalled_writepage_needs_redirty(jinode, folio)) folio_redirty_for_writepage(&wbc, folio); folio_unlock(folio); } return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !ext4_emergency_ro(sb) && journal) jbd2_journal_abort(journal, -EIO); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. We just need to be * careful when the journal is already shutting down. If we get * here in that case, just update the sb directly as the last * transaction won't commit anyway. */ if (continue_fs && journal && !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * We don't set SB_RDONLY because that requires sb->s_umount * semaphore and setting it without proper remount procedure is * confusing code such as freeze_super() leading to deadlocks * and other problems. */ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!ext4_emergency_state(sbi->s_sb) && !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) { struct va_format vaf; va_list args; if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. */ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; rcu_read_lock(); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } else flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { if (!aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } ext4_commit_super(sb); } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && percpu_counter_sum(&sbi->s_dirtyclusters_counter)); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); #ifdef CONFIG_FS_ENCRYPTION ei->i_crypt_info = NULL; #endif #ifdef CONFIG_FS_VERITY ei->i_verity_info = NULL; #endif } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot __rcu **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. */ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_gid ("resgid", Opt_resgid), fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; /* i_version is always enabled now */ fc->sb_flags |= SB_I_VERSION; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. */ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline __maybe_unused \ void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline __maybe_unused \ void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, ¶m); kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char s_mount_opts[65]; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts); fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) return -ENOMEM; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: ext4_fc_free(fc); kfree(fc); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. */ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (!sbi->s_journal && ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { ext4_msg(NULL, KERN_WARNING, "Remounting fs w/o journal so ignoring data_err option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || ext4_get_resuid(es) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || ext4_get_resgid(es) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs && sb->s_flags & SB_I_VERSION) SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); if (ext4_emergency_ro(sb)) SEQ_OPTS_PUTS("emergency_ro"); if (ext4_forced_shutdown(sb)) SEQ_OPTS_PUTS("shutdown"); ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_putc(seq, '\n'); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. * Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !ext4_emergency_state(sb) && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { bool next_wakeup_initialized = false; next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. */ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. */ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_cluster_bits = 0; } sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. * With non-bigalloc filesystem awu will be based upon filesystem blocksize * & bdev awu units. * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. * @sb: super block */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; if (!ext4_has_feature_extents(sb)) return; sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", sbi->s_awu_min, sbi->s_awu_max); } else { sbi->s_awu_min = 0; sbi->s_awu_max = 0; } } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static const char *ext4_has_journal_option(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) return "journal_async_commit"; if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) return "journal_checksum"; if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) return "commit="; if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) return "data="; if (test_opt(sb, DATA_ERR_ABORT)) return "data_err=abort"; return NULL; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_def_hash_version > DX_HASH_LAST) { ext4_msg(sb, KERN_ERR, "Invalid default hash set in the superblock"); return -EINVAL; } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { ext4_msg(sb, KERN_ERR, "SIPHASH is not a valid default hash value"); return -EINVAL; } for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } /* * It's hard to get stripe aligned blocks if stripe is not aligned with * cluster, just disable stripe and alert user to simplify code and avoid * stripe aligned allocation which will rarely succeed. */ static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (stripe > 0 && sbi->s_cluster_ratio > 1 && stripe % sbi->s_cluster_ratio != 0); } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es)); sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resuid(es)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); /* HSM events are allowed by default. */ sb->s_iflags |= SB_I_ALLOW_HSM; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; err = ext4_hash_info_init(sb); if (err) goto failed_mount; err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); spin_lock_init(&sbi->s_bdev_wb_lock); ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; if (bdev_read_only(sb->s_bdev)) needs_recovery = 0; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { const char *journal_option; /* Nojournal mode, all journal mount options are illegal */ journal_option = ext4_has_journal_option(sb); if (journal_option != NULL) { ext4_msg(sb, KERN_ERR, "can't mount with %s, fs mounted w/o journal", journal_option); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. */ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". */ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; err = ext4_percpu_param_init(sbi); if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_init_orphan_info(sb); if (err) goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); clear_opt(sb, DISCARD); } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); /* Register sysfs after all initializations are complete. */ err = ext4_register_sysfs(sb); if (err) goto failed_mount9; return 0; failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount8: __maybe_unused ext4_release_orphan_info(sb); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. */ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; } static int ext4_journal_bmap(journal_t *journal, sector_t *block) { struct ext4_map_blocks map; int ret; if (journal->j_inode == NULL) return 0; map.m_lblk = *block; map.m_len = 1; ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); if (ret <= 0) { ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); jbd2_journal_abort(journal, ret ? ret : -EIO); return ret; } *block = map.m_pblk; return 0; } static journal_t *ext4_open_inode_journal(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (IS_ERR(journal_inode)) return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); return bdev_file; } bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); errno = -EINVAL; goto out_bdev; } es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); errno = -EFSCORRUPTED; goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); errno = -EFSCORRUPTED; goto out_bh; } *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); return bdev_file; out_bh: brelse(bh); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static journal_t *ext4_open_dev_journal(struct super_block *sb, dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct file *bdev_file; int errno = 0; bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); errno = -EINVAL; goto out_journal; } journal->j_private = sb; EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, struct ext4_super_block *es, unsigned long journal_devnum) { journal_t *journal; unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { ext4_msg(sb, KERN_INFO, "external journal device major/minor " "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); if (journal_inum && journal_dev) { ext4_msg(sb, KERN_ERR, "filesystem has both journal inode and journal device!"); return -EINVAL; } if (journal_inum) { journal = ext4_open_inode_journal(sb, journal_inum); if (IS_ERR(journal)) return PTR_ERR(journal); } else { journal = ext4_open_dev_journal(sb, journal_dev); if (IS_ERR(journal)) return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; if (journal_dev_ro && !sb_rdonly(sb)) { ext4_msg(sb, KERN_ERR, "journal device read-only, try mounting with '-o ro'"); err = -EROFS; goto err_out; } /* * Are we loading a blank journal or performing recovery after a * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); err = -EROFS; goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); __le16 orig_state; bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); if (save && memcmp(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); changed = true; } kfree(save); orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); if (orig_state != es->s_state) changed = true; /* Write out restored error information to the superblock */ if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? : err2; } } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); goto err_out; } EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { ext4_journal_destroy(EXT4_SB(sb), journal); return err; } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); ext4_commit_super(sb); } if (!really_read_only && journal_inum && journal_inum != le32_to_cpu(es->s_journal_inum)) { es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } return 0; err_out: ext4_journal_destroy(EXT4_SB(sb), journal); return err; } /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system * read/only but we need to replay the journal; at that point, * for people who are east of GMT and who make their clock * tick in localtime for Windows bug-for-bug compatibility, * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(sbi, percpu_counter_sum_positive( &sbi->s_freeclusters_counter))); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { es->s_state |= cpu_to_le16(EXT4_ERROR_FS); if (!es->s_first_error_time && !es->s_first_error_time_hi) { __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); strtomem_pad(es->s_first_error_func, sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino); es->s_first_error_block = cpu_to_le64(sbi->s_first_error_block); es->s_first_error_errcode = ext4_errno_to_code(sbi->s_first_error_code); } __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been * started already */ if (!es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } spin_unlock(&sbi->s_error_lock); ext4_superblock_csum_set(sb); unlock_buffer(sbh); } static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return -EINVAL; ext4_update_super(sb); lock_buffer(sbh); /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(sbh)) { unlock_buffer(sbh); return -EIO; } if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ ext4_msg(sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } get_bh(sbh); /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); return -EIO; } return 0; } /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es) { int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { if (journal != NULL) { ext4_error(sb, "Journal got removed while the fs was " "mounted!"); return -EFSCORRUPTED; } return 0; } jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || ext4_has_feature_orphan_present(sb))) { if (!ext4_orphan_file_empty(sb)) { ext4_error(sb, "Orphan file not empty on read-only fs."); err = -EFSCORRUPTED; goto out; } ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); return err; } /* * If we are mounting (or read-write remounting) a filesystem whose journal * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; if (!ext4_has_feature_journal(sb)) { ext4_error(sb, "Journal got removed while the fs was mounted!"); return -EFSCORRUPTED; } journal = EXT4_SB(sb)->s_journal; /* * Now check for any error status which may have been recorded in the * journal by a prior ext4_error() or ext4_abort() */ j_errno = jbd2_journal_errno(journal); if (j_errno) { char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); j_errno = ext4_commit_super(sb); if (j_errno) return j_errno; ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } return 0; } /* * Force the running and committing transactions to commit, * and wait on the commit. */ int ext4_force_commit(struct super_block *sb) { return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); /* * Data writeback is possible w/o journal transaction, so barrier must * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ if (sbi->s_journal) { target = jbd2_get_latest_transaction(sbi->s_journal); if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) needs_barrier = true; if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) ret = jbd2_log_wait_commit(sbi->s_journal, target); } } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean * state independently. It relies on upper layer to stop all data & metadata * modifications. */ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); /* * Don't clear the needs_recovery flag if we failed to * flush the journal. */ error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(journal); return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ static int ext4_unfreeze(struct super_block *sb) { if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); return 0; } /* * Structure to save mount options for ext4_remount's benefit */ struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; kuid_t s_resuid; kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_min_batch_time = sbi->s_min_batch_time; old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { char *qf_name = get_qf_name(sb, sbi, i); old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { if (sbi->s_journal && sbi->s_journal->j_task->io_context) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } if ((ctx->spec & EXT4_SPEC_s_stripe) && ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", ctx->s_stripe, sbi->s_cluster_ratio); ctx->s_stripe = 0; } /* * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * two calls to ext4_should_dioread_nolock() to return inconsistent * values, triggering WARN_ON in ext4_add_complete_io(). we grab * here s_writepages_rwsem to avoid race between writepages ops and * remount. */ alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " "during remount not supported; ignoring"); sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); err = -EINVAL; goto restore_opts; } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dioread_nolock"); err = -EINVAL; goto restore_opts; } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); err = -EINVAL; goto restore_opts; } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); err = -EINVAL; goto restore_opts; } if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && !test_opt(sb, DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); es = sbi->s_es; if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition * readonly, and if so set the rdonly flag and then * mark the partition as valid again. */ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); if (sbi->s_journal) { /* * We let remount-ro finish even if marking fs * as clean failed... */ ext4_mark_recovery_complete(sb, es); } } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EFSBADCRC; goto restore_opts; } } /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. */ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " "umount/remount instead"); err = -EINVAL; goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ if (sbi->s_journal) { err = ext4_clear_journal_err(sb, es); if (err) goto restore_opts; } sbi->s_mount_state = (le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA enable_quota = 1; #endif } } /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will * succeed. */ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; } } /* Release old quota file names */ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on * current settings */ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; first_not_zeroed = ext4_has_uninit_itable(sb); ext4_register_li_request(sb, first_not_zeroed); } if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); /* * Handle aborting the filesystem as the last thing during remount to * avoid obsure errors during remount when some option changes fail to * apply due to shutdown filesystem. */ if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); return 0; restore_opts: /* * If there was a failing r/w to ro transition, we may need to * re-enable quota */ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; ext4_writepages_up_write(sb, alloc_ctx); if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { to_free[i] = get_qf_name(sb, sbi, i); rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } synchronize_rcu(); for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return err; } static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) return ret; ret = __ext4_remount(fc, sb); if (ret < 0) return ret; ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", &sb->s_uuid, (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit) { uint64_t remaining = 0; curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; if (limit > curblock) remaining = limit - curblock; buf->f_blocks = min(buf->f_blocks, limit); buf->f_bfree = min(buf->f_bfree, remaining); buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit) { uint64_t remaining = 0; if (limit > dquot->dq_dqb.dqb_curinodes) remaining = limit - dquot->dq_dqb.dqb_curinodes; buf->f_files = min(buf->f_files, limit); buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - (ext4_r_blocks_count(es) + resv_blocks); if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif return 0; } #ifdef CONFIG_QUOTA /* * Helper functions so that transaction is started before we acquire dqio_sem * to keep correct lock ordering of transaction > dqio_sem */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; struct inode *inode; inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to commit dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_acquire_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to acquire dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; bool freeze_protected = false; /* * Trying to sb_start_intwrite() in a running transaction * can result in a deadlock. Further, running transactions * are already protected from freezing. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(dquot->dq_sb); freeze_protected = true; } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); if (freeze_protected) sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to release dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; if (freeze_protected) sb_end_intwrite(dquot->dq_sb); return ret; } static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { return dquot_mark_dquot_dirty(dquot); } } static int ext4_write_info(struct super_block *sb, int type) { int ret, err; handle_t *handle; /* Data block + inode block */ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); /* The first argument of lockdep_set_subclass has to be * *exactly* the same as the argument to init_rwsem() --- in * this case, in init_once() --- or lockdep gets unhappy * because the name of the lock is set using the * stringification of the argument to init_rwsem(). */ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, subclass); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; if (!test_opt(sb, QUOTA)) return -EINVAL; /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; /* Quota already enabled for this file? */ if (IS_NOQUOTA(d_inode(path->dentry))) return -EBUSY; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; } else { /* * Clear the flag just in case mount options changed since * last time. */ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; /* * Set inode flags to prevent userspace from messing with quota * files. If this fails, we return success anyway since quotas * are already enabled and this is not a hard failure. */ inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto unlock_inode; EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); if (err) dquot_quota_off(sb, type); } if (err) lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; } static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) { switch (type) { case USRQUOTA: return qf_inum == EXT4_USR_QUOTA_INO; case GRPQUOTA: return qf_inum == EXT4_GRP_QUOTA_INO; case PRJQUOTA: return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; default: BUG(); } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { int err; struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; if (!ext4_check_quota_inum(type, qf_inums[type])) { ext4_error(sb, "Bad quota inum: %lu, type: %d", qf_inums[type], type); return -EUCLEAN; } qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode: %lu, type: %d", qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); return err; } /* Enable usage tracking for all quota types. */ int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; bool quota_mopt[EXT4_MAXQUOTAS] = { test_opt(sb, USRQUOTA), test_opt(sb, GRPQUOTA), test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); ext4_quotas_off(sb, type); return err; } } } return 0; } static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; /* * When the filesystem was remounted read-only first, we cannot cleanup * inode flags here. Bad luck but people should be using QUOTA feature * these days anyway. */ if (sb_rdonly(sb)) goto out_put; inode_lock(inode); /* * Update modification times of quota files when userspace can * start looking at them. If we fail, we return success anyway since * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_unlock; } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); out_put: lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: return dquot_quota_off(sb, type); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else memcpy(data, bh->b_data+offset, tocopy); brelse(bh); offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile (we know the transaction is already started and has * enough credits) */ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } /* * Since we account only one data block in transaction credits, * then it is impossible to cross a block boundary. */ if (sb->s_blocksize - offset < len) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because not block aligned", (unsigned long long)off, (unsigned long long)len); return -EIO; } do { bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; } return err ? err : len; } #endif #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext2 (%d)\n", err); } static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext3 (%d)\n", err); } static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } static inline int ext3_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; if (!ext4_has_feature_journal(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); if (bdev_file) bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); static int __init ext4_init_fs(void) { int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; /* Build-time check for flags consistency */ ext4_check_flag_values(); err = ext4_init_es(); if (err) return err; err = ext4_init_pending(); if (err) goto out7; err = ext4_init_post_read_processing(); if (err) goto out6; err = ext4_init_pageio(); if (err) goto out5; err = ext4_init_system_zone(); if (err) goto out4; err = ext4_init_sysfs(); if (err) goto out3; err = ext4_init_mballoc(); if (err) goto out2; err = init_inodecache(); if (err) goto out1; err = ext4_fc_init_dentry_cache(); if (err) goto out05; register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: unregister_as_ext2(); unregister_as_ext3(); ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: ext4_exit_mballoc(); out2: ext4_exit_sysfs(); out3: ext4_exit_system_zone(); out4: ext4_exit_pageio(); out5: ext4_exit_post_read_processing(); out6: ext4_exit_pending(); out7: ext4_exit_es(); return err; } static void __exit ext4_exit_fs(void) { ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) |
| 4 93 166 3 30 133 180 3 173 4 4 174 171 56 116 2 4 1 1 2311 2284 29 1914 1889 26 389 378 1 6 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | // SPDX-License-Identifier: GPL-2.0-or-later /* * lwtunnel Infrastructure for light weight tunnels like mpls * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/lwtunnel.h> #include <linux/in.h> #include <linux/init.h> #include <linux/err.h> #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> #include <net/rtnh.h> #include "dev.h" DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) { /* Only lwt encaps implemented without using an interface for * the encap need to return a string here. */ switch (encap_type) { case LWTUNNEL_ENCAP_MPLS: return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; case LWTUNNEL_ENCAP_XFRM: /* module autoload not supported for encap type */ return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ WARN_ON(1); break; } return NULL; } #endif /* CONFIG_MODULES */ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); return lws; } EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, unsigned int num) { if (num > LWTUNNEL_ENCAP_MAX) return -ERANGE; return !cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, unsigned int encap_type) { int ret; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) return -ERANGE; ret = (cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[encap_type], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG_ATTR(extack, encap, "Unknown LWT encapsulation type"); return ret; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) found = true; rcu_read_unlock(); if (found) { ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { /* don't rely on -EOPNOTSUPP to detect match as build_state * handlers could return it */ NL_SET_ERR_MSG_ATTR(extack, encap, "LWT encapsulation type not supported"); } return ret; } EXPORT_SYMBOL_GPL(lwtunnel_build_state); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; } ops = rcu_access_pointer(lwtun_encaps[encap_type]); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { request_module("rtnl-lwt-%s", encap_type_str); ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } #endif ret = ops ? 0 : -EOPNOTSUPP; if (ret < 0) NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; u16 encap_type; int attrlen; while (rtnh_ok(rtnh, remaining)) { attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { if (nla_len(nla_entype) < sizeof(u16)) { NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); return -EINVAL; } encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } rtnh = rtnh_next(rtnh, &remaining); } return 0; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; if (ops->destroy_state) { ops->destroy_state(lws); kfree_rcu(lws, rcu); } else { kfree(lws); } module_put(ops->owner); } EXPORT_SYMBOL_GPL(lwtstate_free); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; int ret; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; nest = nla_nest_start_noflag(skb, encap_attr); if (!nest) return -EMSGSIZE; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) ret = ops->fill_encap(skb, lwtstate); rcu_read_unlock(); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; return 0; nla_put_failure: nla_nest_cancel(skb, nest); return (ret == -EOPNOTSUPP ? 0 : ret); } EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->get_encap_size)) ret = nla_total_size(ops->get_encap_size(lwtstate)); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!a && !b) return 0; if (!a || !b) return 1; if (a->type != b->type) return 1; if (a->type == LWTUNNEL_ENCAP_NONE || a->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[a->type]); if (likely(ops && ops->cmp_encap)) ret = ops->cmp_encap(a, b); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; struct dst_entry *dst; int ret; local_bh_disable(); if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); ret = -ENETDOWN; goto drop; } dst = skb_dst(skb); if (!dst) { ret = -EINVAL; goto drop; } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) { ret = 0; goto out; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) { dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); dev_xmit_recursion_dec(); } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; goto out; drop: kfree_skb(skb); out: local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; struct dst_entry *dst; int ret; local_bh_disable(); if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); ret = -ENETDOWN; goto drop; } dst = skb_dst(skb); if (!dst) { ret = -EINVAL; goto drop; } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) { ret = 0; goto out; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->xmit)) { dev_xmit_recursion_inc(); ret = ops->xmit(skb); dev_xmit_recursion_dec(); } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; goto out; drop: kfree_skb(skb); out: local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; struct dst_entry *dst; int ret; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); ret = -ENETDOWN; goto drop; } dst = skb_dst(skb); if (!dst) { ret = -EINVAL; goto drop; } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->input)) { dev_xmit_recursion_inc(); ret = ops->input(skb); dev_xmit_recursion_dec(); } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_input); |
| 1980 270 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/percpu_counter.h> #include <linux/atomic.h> #include <uapi/linux/mman.h> /* * Arrange for legacy / undefined architecture specific flags to be * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 #endif #ifndef MAP_ABOVE4G #define MAP_ABOVE4G 0 #endif #ifndef MAP_HUGE_2MB #define MAP_HUGE_2MB 0 #endif #ifndef MAP_HUGE_1GB #define MAP_HUGE_1GB 0 #endif #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif #ifndef MAP_SYNC #define MAP_SYNC 0 #endif /* * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ | MAP_FIXED \ | MAP_ANONYMOUS \ | MAP_DENYWRITE \ | MAP_EXECUTABLE \ | MAP_UNINITIALIZED \ | MAP_GROWSDOWN \ | MAP_LOCKED \ | MAP_NORESERVE \ | MAP_POPULATE \ | MAP_NONBLOCK \ | MAP_STACK \ | MAP_HUGETLB \ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ | MAP_HUGE_1GB) extern int sysctl_overcommit_memory; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP extern s32 vm_committed_as_batch; extern void mm_compute_batch(int overcommit_policy); #else #define vm_committed_as_batch 0 static inline void mm_compute_batch(int overcommit_policy) { } #endif unsigned long vm_memory_committed(void); static inline void vm_acct_memory(long pages) { percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); } static inline void vm_unacct_memory(long pages) { vm_acct_memory(-pages); } /* * Allow architectures to handle additional protection and flag bits. The * overriding macros must be defined in the arch-specific asm/mman.h file. */ #ifndef arch_calc_vm_prot_bits #define arch_calc_vm_prot_bits(prot, pkey) 0 #endif #ifndef arch_calc_vm_flag_bits #define arch_calc_vm_flag_bits(file, flags) 0 #endif #ifndef arch_validate_prot /* * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have * already been masked out. * * Returns true if the prot flags are valid */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; } #define arch_validate_prot arch_validate_prot #endif #ifndef arch_validate_flags /* * This is called from mmap() and mprotect() with the updated vma->vm_flags. * * Returns true if the VM_* flags are valid. */ static inline bool arch_validate_flags(unsigned long flags) { return true; } #define arch_validate_flags arch_validate_flags #endif /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 * but this version is faster. * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. */ static inline vm_flags_t calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | arch_calc_vm_prot_bits(prot, pkey); } /* * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline vm_flags_t calc_vm_flag_bits(struct file *file, unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | #ifdef CONFIG_TRANSPARENT_HUGEPAGE _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | #endif arch_calc_vm_flag_bits(file, flags); } unsigned long vm_commit_limit(void); #ifndef arch_memory_deny_write_exec_supported static inline bool arch_memory_deny_write_exec_supported(void) { return true; } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif /* * Denies creating a writable executable mapping or gaining executable permissions. * * This denies the following: * * a) mmap(PROT_WRITE | PROT_EXEC) * * b) mmap(PROT_WRITE) * mprotect(PROT_EXEC) * * c) mmap(PROT_WRITE) * mprotect(PROT_READ) * mprotect(PROT_EXEC) * * But allows the following: * * d) mmap(PROT_READ | PROT_EXEC) * mmap(PROT_READ | PROT_EXEC | PROT_BTI) * * This is only applicable if the user has set the Memory-Deny-Write-Execute * (MDWE) protection mask for the current process. * * @old specifies the VMA flags the VMA originally possessed, and @new the ones * we propose to set. * * Return: false if proposed change is OK, true if not ok and should be denied. */ static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ if (!(new & VM_EXEC)) return false; /* Under MDWE we do not accept newly writably executable VMAs... */ if (new & VM_WRITE) return true; /* ...nor previously non-executable VMAs becoming executable. */ if (!(old & VM_EXEC)) return true; return false; } #endif /* _LINUX_MMAN_H */ |
| 1452 29 220 2150 3633 3633 2150 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_LWTUNNEL_H #define __NET_LWTUNNEL_H 1 #include <linux/lwtunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/route.h> #define LWTUNNEL_HASH_BITS 7 #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) /* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. */ enum { LWTUNNEL_XMIT_DONE, LWTUNNEL_XMIT_CONTINUE = 0x100, }; struct lwtunnel_state { __u16 type; __u16 flags; __u16 headroom; atomic_t refcnt; int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); struct rcu_head rcu; __u8 data[]; }; struct lwtunnel_encap_ops { int (*build_state)(struct net *net, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack); void (*destroy_state)(struct lwtunnel_state *lws); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); int (*xmit)(struct sk_buff *skb); struct module *owner; }; #ifdef CONFIG_LWTUNNEL DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); void lwtstate_free(struct lwtunnel_state *lws); static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { if (lws) atomic_inc(&lws->refcnt); return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; if (atomic_dec_and_test(&lws->refcnt)) lwtstate_free(lws); } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) return true; return false; } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { if ((lwtunnel_xmit_redirect(lwtstate) || lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu) return lwtstate->headroom; return 0; } int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { dst->lwtstate->orig_output = READ_ONCE(dst->output); WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { dst->lwtstate->orig_input = READ_ONCE(dst->input); WRITE_ONCE(dst->input, lwtunnel_input); } } #else static inline void lwtstate_free(struct lwtunnel_state *lws) { } static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline void lwtunnel_set_redirect(struct dst_entry *dst) { } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { return 0; } static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. */ return 0; } static inline int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { return 0; } static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { return 0; } static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) { return NULL; } static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { return 0; } static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_xmit(struct sk_buff *skb) { return -EOPNOTSUPP; } #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */ |
| 196 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel reference Implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel reference Implementation * * Various protocol defined structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * randall@sctp.chicago.il.us * kmorneau@cisco.com * qxie1@email.mot.com * Sridhar Samudrala <sri@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> * * Any bugs reported given to us we will try to fix... any fixes shared will * be incorporated into the next SCTP release. */ #ifndef __LINUX_SCTP_H__ #define __LINUX_SCTP_H__ #include <linux/in.h> /* We need in_addr. */ #include <linux/in6.h> /* We need in6_addr. */ #include <linux/skbuff.h> #include <uapi/linux/sctp.h> /* Section 3.1. SCTP Common Header Format */ struct sctphdr { __be16 source; __be16 dest; __be32 vtag; __le32 checksum; }; static inline struct sctphdr *sctp_hdr(const struct sk_buff *skb) { return (struct sctphdr *)skb_transport_header(skb); } /* Section 3.2. Chunk Field Descriptions. */ struct sctp_chunkhdr { __u8 type; __u8 flags; __be16 length; }; /* Section 3.2. Chunk Type Values. * [Chunk Type] identifies the type of information contained in the Chunk * Value field. It takes a value from 0 to 254. The value of 255 is * reserved for future use as an extension field. */ enum sctp_cid { SCTP_CID_DATA = 0, SCTP_CID_INIT = 1, SCTP_CID_INIT_ACK = 2, SCTP_CID_SACK = 3, SCTP_CID_HEARTBEAT = 4, SCTP_CID_HEARTBEAT_ACK = 5, SCTP_CID_ABORT = 6, SCTP_CID_SHUTDOWN = 7, SCTP_CID_SHUTDOWN_ACK = 8, SCTP_CID_ERROR = 9, SCTP_CID_COOKIE_ECHO = 10, SCTP_CID_COOKIE_ACK = 11, SCTP_CID_ECN_ECNE = 12, SCTP_CID_ECN_CWR = 13, SCTP_CID_SHUTDOWN_COMPLETE = 14, /* AUTH Extension Section 4.1 */ SCTP_CID_AUTH = 0x0F, /* sctp ndata 5.1. I-DATA */ SCTP_CID_I_DATA = 0x40, /* PR-SCTP Sec 3.2 */ SCTP_CID_FWD_TSN = 0xC0, /* Use hex, as defined in ADDIP sec. 3.1 */ SCTP_CID_ASCONF = 0xC1, SCTP_CID_I_FWD_TSN = 0xC2, SCTP_CID_ASCONF_ACK = 0x80, SCTP_CID_RECONF = 0x82, SCTP_CID_PAD = 0x84, }; /* enum */ /* Section 3.2 * Chunk Types are encoded such that the highest-order two bits specify * the action that must be taken if the processing endpoint does not * recognize the Chunk Type. */ enum { SCTP_CID_ACTION_DISCARD = 0x00, SCTP_CID_ACTION_DISCARD_ERR = 0x40, SCTP_CID_ACTION_SKIP = 0x80, SCTP_CID_ACTION_SKIP_ERR = 0xc0, }; enum { SCTP_CID_ACTION_MASK = 0xc0, }; /* This flag is used in Chunk Flags for ABORT and SHUTDOWN COMPLETE. * * 3.3.7 Abort Association (ABORT) (6): * The T bit is set to 0 if the sender had a TCB that it destroyed. * If the sender did not have a TCB it should set this bit to 1. */ enum { SCTP_CHUNK_FLAG_T = 0x01 }; /* * Set the T bit * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 14 |Reserved |T| Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Reserved: 7 bits * Set to 0 on transmit and ignored on receipt. * * T bit: 1 bit * The T bit is set to 0 if the sender had a TCB that it destroyed. If * the sender did NOT have a TCB it should set this bit to 1. * * Note: Special rules apply to this chunk for verification, please * see Section 8.5.1 for details. */ #define sctp_test_T_bit(c) ((c)->chunk_hdr->flags & SCTP_CHUNK_FLAG_T) /* RFC 2960 * Section 3.2.1 Optional/Variable-length Parmaeter Format. */ struct sctp_paramhdr { __be16 type; __be16 length; }; enum sctp_param { /* RFC 2960 Section 3.3.5 */ SCTP_PARAM_HEARTBEAT_INFO = cpu_to_be16(1), /* RFC 2960 Section 3.3.2.1 */ SCTP_PARAM_IPV4_ADDRESS = cpu_to_be16(5), SCTP_PARAM_IPV6_ADDRESS = cpu_to_be16(6), SCTP_PARAM_STATE_COOKIE = cpu_to_be16(7), SCTP_PARAM_UNRECOGNIZED_PARAMETERS = cpu_to_be16(8), SCTP_PARAM_COOKIE_PRESERVATIVE = cpu_to_be16(9), SCTP_PARAM_HOST_NAME_ADDRESS = cpu_to_be16(11), SCTP_PARAM_SUPPORTED_ADDRESS_TYPES = cpu_to_be16(12), SCTP_PARAM_ECN_CAPABLE = cpu_to_be16(0x8000), /* AUTH Extension Section 3 */ SCTP_PARAM_RANDOM = cpu_to_be16(0x8002), SCTP_PARAM_CHUNKS = cpu_to_be16(0x8003), SCTP_PARAM_HMAC_ALGO = cpu_to_be16(0x8004), /* Add-IP: Supported Extensions, Section 4.2 */ SCTP_PARAM_SUPPORTED_EXT = cpu_to_be16(0x8008), /* PR-SCTP Sec 3.1 */ SCTP_PARAM_FWD_TSN_SUPPORT = cpu_to_be16(0xc000), /* Add-IP Extension. Section 3.2 */ SCTP_PARAM_ADD_IP = cpu_to_be16(0xc001), SCTP_PARAM_DEL_IP = cpu_to_be16(0xc002), SCTP_PARAM_ERR_CAUSE = cpu_to_be16(0xc003), SCTP_PARAM_SET_PRIMARY = cpu_to_be16(0xc004), SCTP_PARAM_SUCCESS_REPORT = cpu_to_be16(0xc005), SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006), /* RE-CONFIG. Section 4 */ SCTP_PARAM_RESET_OUT_REQUEST = cpu_to_be16(0x000d), SCTP_PARAM_RESET_IN_REQUEST = cpu_to_be16(0x000e), SCTP_PARAM_RESET_TSN_REQUEST = cpu_to_be16(0x000f), SCTP_PARAM_RESET_RESPONSE = cpu_to_be16(0x0010), SCTP_PARAM_RESET_ADD_OUT_STREAMS = cpu_to_be16(0x0011), SCTP_PARAM_RESET_ADD_IN_STREAMS = cpu_to_be16(0x0012), }; /* enum */ /* RFC 2960 Section 3.2.1 * The Parameter Types are encoded such that the highest-order two bits * specify the action that must be taken if the processing endpoint does * not recognize the Parameter Type. * */ enum { SCTP_PARAM_ACTION_DISCARD = cpu_to_be16(0x0000), SCTP_PARAM_ACTION_DISCARD_ERR = cpu_to_be16(0x4000), SCTP_PARAM_ACTION_SKIP = cpu_to_be16(0x8000), SCTP_PARAM_ACTION_SKIP_ERR = cpu_to_be16(0xc000), }; enum { SCTP_PARAM_ACTION_MASK = cpu_to_be16(0xc000), }; /* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */ struct sctp_datahdr { __be32 tsn; __be16 stream; __be16 ssn; __u32 ppid; }; struct sctp_data_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_datahdr data_hdr; }; struct sctp_idatahdr { __be32 tsn; __be16 stream; __be16 reserved; __be32 mid; union { __u32 ppid; __be32 fsn; }; }; struct sctp_idata_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_idatahdr data_hdr; }; /* DATA Chuck Specific Flags */ enum { SCTP_DATA_MIDDLE_FRAG = 0x00, SCTP_DATA_LAST_FRAG = 0x01, SCTP_DATA_FIRST_FRAG = 0x02, SCTP_DATA_NOT_FRAG = 0x03, SCTP_DATA_UNORDERED = 0x04, SCTP_DATA_SACK_IMM = 0x08, }; enum { SCTP_DATA_FRAG_MASK = 0x03, }; /* RFC 2960 Section 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. */ struct sctp_inithdr { __be32 init_tag; __be32 a_rwnd; __be16 num_outbound_streams; __be16 num_inbound_streams; __be32 initial_tsn; /* __u8 params[]; */ }; struct sctp_init_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_inithdr init_hdr; }; /* Section 3.3.2.1. IPv4 Address Parameter (5) */ struct sctp_ipv4addr_param { struct sctp_paramhdr param_hdr; struct in_addr addr; }; /* Section 3.3.2.1. IPv6 Address Parameter (6) */ struct sctp_ipv6addr_param { struct sctp_paramhdr param_hdr; struct in6_addr addr; }; /* Section 3.3.2.1 Cookie Preservative (9) */ struct sctp_cookie_preserve_param { struct sctp_paramhdr param_hdr; __be32 lifespan_increment; }; /* Section 3.3.2.1 Host Name Address (11) */ struct sctp_hostname_param { struct sctp_paramhdr param_hdr; uint8_t hostname[]; }; /* Section 3.3.2.1 Supported Address Types (12) */ struct sctp_supported_addrs_param { struct sctp_paramhdr param_hdr; __be16 types[]; }; /* ADDIP Section 3.2.6 Adaptation Layer Indication */ struct sctp_adaptation_ind_param { struct sctp_paramhdr param_hdr; __be32 adaptation_ind; }; /* ADDIP Section 4.2.7 Supported Extensions Parameter */ struct sctp_supported_ext_param { struct sctp_paramhdr param_hdr; __u8 chunks[]; }; /* AUTH Section 3.1 Random */ struct sctp_random_param { struct sctp_paramhdr param_hdr; __u8 random_val[]; }; /* AUTH Section 3.2 Chunk List */ struct sctp_chunks_param { struct sctp_paramhdr param_hdr; __u8 chunks[]; }; /* AUTH Section 3.3 HMAC Algorithm */ struct sctp_hmac_algo_param { struct sctp_paramhdr param_hdr; __be16 hmac_ids[]; }; /* RFC 2960. Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2): * The INIT ACK chunk is used to acknowledge the initiation of an SCTP * association. */ struct sctp_initack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_inithdr init_hdr; }; /* Section 3.3.3.1 State Cookie (7) */ struct sctp_cookie_param { struct sctp_paramhdr p; __u8 body[]; }; /* Section 3.3.3.1 Unrecognized Parameters (8) */ struct sctp_unrecognized_param { struct sctp_paramhdr param_hdr; struct sctp_paramhdr unrecognized; }; /* * 3.3.4 Selective Acknowledgement (SACK) (3): * * This chunk is sent to the peer endpoint to acknowledge received DATA * chunks and to inform the peer endpoint of gaps in the received * subsequences of DATA chunks as represented by their TSNs. */ struct sctp_gap_ack_block { __be16 start; __be16 end; }; union sctp_sack_variable { struct sctp_gap_ack_block gab; __be32 dup; }; struct sctp_sackhdr { __be32 cum_tsn_ack; __be32 a_rwnd; __be16 num_gap_ack_blocks; __be16 num_dup_tsns; /* union sctp_sack_variable variable[]; */ }; struct sctp_sack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_sackhdr sack_hdr; }; /* RFC 2960. Section 3.3.5 Heartbeat Request (HEARTBEAT) (4): * * An endpoint should send this chunk to its peer endpoint to probe the * reachability of a particular destination transport address defined in * the present association. */ struct sctp_heartbeathdr { struct sctp_paramhdr info; }; struct sctp_heartbeat_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_heartbeathdr hb_hdr; }; /* PAD chunk could be bundled with heartbeat chunk to probe pmtu */ struct sctp_pad_chunk { struct sctp_chunkhdr uh; }; /* For the abort and shutdown ACK we must carry the init tag in the * common header. Just the common header is all that is needed with a * chunk descriptor. */ struct sctp_abort_chunk { struct sctp_chunkhdr uh; }; /* For the graceful shutdown we must carry the tag (in common header) * and the highest consecutive acking value. */ struct sctp_shutdownhdr { __be32 cum_tsn_ack; }; struct sctp_shutdown_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_shutdownhdr shutdown_hdr; }; /* RFC 2960. Section 3.3.10 Operation Error (ERROR) (9) */ struct sctp_errhdr { __be16 cause; __be16 length; /* __u8 variable[]; */ }; struct sctp_operr_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_errhdr err_hdr; }; /* RFC 2960 3.3.10 - Operation Error * * Cause Code: 16 bits (unsigned integer) * * Defines the type of error conditions being reported. * Cause Code * Value Cause Code * --------- ---------------- * 1 Invalid Stream Identifier * 2 Missing Mandatory Parameter * 3 Stale Cookie Error * 4 Out of Resource * 5 Unresolvable Address * 6 Unrecognized Chunk Type * 7 Invalid Mandatory Parameter * 8 Unrecognized Parameters * 9 No User Data * 10 Cookie Received While Shutting Down */ enum sctp_error { SCTP_ERROR_NO_ERROR = cpu_to_be16(0x00), SCTP_ERROR_INV_STRM = cpu_to_be16(0x01), SCTP_ERROR_MISS_PARAM = cpu_to_be16(0x02), SCTP_ERROR_STALE_COOKIE = cpu_to_be16(0x03), SCTP_ERROR_NO_RESOURCE = cpu_to_be16(0x04), SCTP_ERROR_DNS_FAILED = cpu_to_be16(0x05), SCTP_ERROR_UNKNOWN_CHUNK = cpu_to_be16(0x06), SCTP_ERROR_INV_PARAM = cpu_to_be16(0x07), SCTP_ERROR_UNKNOWN_PARAM = cpu_to_be16(0x08), SCTP_ERROR_NO_DATA = cpu_to_be16(0x09), SCTP_ERROR_COOKIE_IN_SHUTDOWN = cpu_to_be16(0x0a), /* SCTP Implementation Guide: * 11 Restart of an association with new addresses * 12 User Initiated Abort * 13 Protocol Violation * 14 Restart of an Association with New Encapsulation Port */ SCTP_ERROR_RESTART = cpu_to_be16(0x0b), SCTP_ERROR_USER_ABORT = cpu_to_be16(0x0c), SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d), SCTP_ERROR_NEW_ENCAP_PORT = cpu_to_be16(0x0e), /* ADDIP Section 3.3 New Error Causes * * Four new Error Causes are added to the SCTP Operational Errors, * primarily for use in the ASCONF-ACK chunk. * * Value Cause Code * --------- ---------------- * 0x00A0 Request to Delete Last Remaining IP Address. * 0x00A1 Operation Refused Due to Resource Shortage. * 0x00A2 Request to Delete Source IP Address. * 0x00A3 Association Aborted due to illegal ASCONF-ACK * 0x00A4 Request refused - no authorization. */ SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x00A0), SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x00A1), SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x00A2), SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x00A3), SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x00A4), /* AUTH Section 4. New Error Cause * * This section defines a new error cause that will be sent if an AUTH * chunk is received with an unsupported HMAC identifier. * illustrates the new error cause. * * Cause Code Error Cause Name * -------------------------------------------------------------- * 0x0105 Unsupported HMAC Identifier */ SCTP_ERROR_UNSUP_HMAC = cpu_to_be16(0x0105) }; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Explicit Congestion Notification Echo (ECNE) (12) */ struct sctp_ecnehdr { __be32 lowest_tsn; }; struct sctp_ecne_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ecnehdr ence_hdr; }; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Congestion Window Reduced (CWR) (13) */ struct sctp_cwrhdr { __be32 lowest_tsn; }; /* PR-SCTP * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN) * * Forward Cumulative TSN chunk has the following format: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 192 | Flags = 0x00 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | New Cumulative TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream-1 | Stream Sequence-1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream-N | Stream Sequence-N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: * * Set to all zeros on transmit and ignored on receipt. * * New Cumulative TSN: 32 bit u_int * * This indicates the new cumulative TSN to the data receiver. Upon * the reception of this value, the data receiver MUST consider * any missing TSNs earlier than or equal to this value as received * and stop reporting them as gaps in any subsequent SACKs. * * Stream-N: 16 bit u_int * * This field holds a stream number that was skipped by this * FWD-TSN. * * Stream Sequence-N: 16 bit u_int * This field holds the sequence number associated with the stream * that was skipped. The stream sequence field holds the largest stream * sequence number in this stream being skipped. The receiver of * the FWD-TSN's can use the Stream-N and Stream Sequence-N fields * to enable delivery of any stranded TSN's that remain on the stream * re-ordering queues. This field MUST NOT report TSN's corresponding * to DATA chunk that are marked as unordered. For ordered DATA * chunks this field MUST be filled in. */ struct sctp_fwdtsn_skip { __be16 stream; __be16 ssn; }; struct sctp_fwdtsn_hdr { __be32 new_cum_tsn; /* struct sctp_fwdtsn_skip skip[]; */ }; struct sctp_fwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_fwdtsn_hdr fwdtsn_hdr; }; struct sctp_ifwdtsn_skip { __be16 stream; __u8 reserved; __u8 flags; __be32 mid; }; struct sctp_ifwdtsn_hdr { __be32 new_cum_tsn; /* struct sctp_ifwdtsn_skip skip[]; */ }; struct sctp_ifwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ifwdtsn_hdr fwdtsn_hdr; }; /* ADDIP * Section 3.1.1 Address Configuration Change Chunk (ASCONF) * * Serial Number: 32 bits (unsigned integer) * This value represents a Serial Number for the ASCONF Chunk. The * valid range of Serial Number is from 0 to 2^32-1. * Serial Numbers wrap back to 0 after reaching 2^32 -1. * * Address Parameter: 8 or 20 bytes (depending on type) * The address is an address of the sender of the ASCONF chunk, * the address MUST be considered part of the association by the * peer endpoint. This field may be used by the receiver of the * ASCONF to help in finding the association. This parameter MUST * be present in every ASCONF message i.e. it is a mandatory TLV * parameter. * * ASCONF Parameter: TLV format * Each Address configuration change is represented by a TLV * parameter as defined in Section 3.2. One or more requests may * be present in an ASCONF Chunk. * * Section 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * * Serial Number: 32 bits (unsigned integer) * This value represents the Serial Number for the received ASCONF * Chunk that is acknowledged by this chunk. This value is copied * from the received ASCONF Chunk. * * ASCONF Parameter Response: TLV format * The ASCONF Parameter Response is used in the ASCONF-ACK to * report status of ASCONF processing. */ struct sctp_addip_param { struct sctp_paramhdr param_hdr; __be32 crr_id; }; struct sctp_addiphdr { __be32 serial; /* __u8 params[]; */ }; struct sctp_addip_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_addiphdr addip_hdr; }; /* AUTH * Section 4.1 Authentication Chunk (AUTH) * * This chunk is used to hold the result of the HMAC calculation. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x0F | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Shared Key Identifier | HMAC Identifier | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ HMAC / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Type: 1 byte (unsigned integer) * This value MUST be set to 0x0F for all AUTH-chunks. * * Flags: 1 byte (unsigned integer) * Set to zero on transmit and ignored on receipt. * * Length: 2 bytes (unsigned integer) * This value holds the length of the HMAC in bytes plus 8. * * Shared Key Identifier: 2 bytes (unsigned integer) * This value describes which endpoint pair shared key is used. * * HMAC Identifier: 2 bytes (unsigned integer) * This value describes which message digest is being used. Table 2 * shows the currently defined values. * * The following Table 2 shows the currently defined values for HMAC * identifiers. * * +-----------------+--------------------------+ * | HMAC Identifier | Message Digest Algorithm | * +-----------------+--------------------------+ * | 0 | Reserved | * | 1 | SHA-1 defined in [8] | * | 2 | Reserved | * | 3 | SHA-256 defined in [8] | * +-----------------+--------------------------+ * * * HMAC: n bytes (unsigned integer) This hold the result of the HMAC * calculation. */ struct sctp_authhdr { __be16 shkey_id; __be16 hmac_id; /* __u8 hmac[]; */ }; struct sctp_auth_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_authhdr auth_hdr; }; struct sctp_infox { struct sctp_info *sctpinfo; struct sctp_association *asoc; }; struct sctp_reconf_chunk { struct sctp_chunkhdr chunk_hdr; /* __u8 params[]; */ }; struct sctp_strreset_outreq { struct sctp_paramhdr param_hdr; __be32 request_seq; __be32 response_seq; __be32 send_reset_at_tsn; __be16 list_of_streams[]; }; struct sctp_strreset_inreq { struct sctp_paramhdr param_hdr; __be32 request_seq; __be16 list_of_streams[]; }; struct sctp_strreset_tsnreq { struct sctp_paramhdr param_hdr; __be32 request_seq; }; struct sctp_strreset_addstrm { struct sctp_paramhdr param_hdr; __be32 request_seq; __be16 number_of_streams; __be16 reserved; }; enum { SCTP_STRRESET_NOTHING_TO_DO = 0x00, SCTP_STRRESET_PERFORMED = 0x01, SCTP_STRRESET_DENIED = 0x02, SCTP_STRRESET_ERR_WRONG_SSN = 0x03, SCTP_STRRESET_ERR_IN_PROGRESS = 0x04, SCTP_STRRESET_ERR_BAD_SEQNO = 0x05, SCTP_STRRESET_IN_PROGRESS = 0x06, }; struct sctp_strreset_resp { struct sctp_paramhdr param_hdr; __be32 response_seq; __be32 result; }; struct sctp_strreset_resptsn { struct sctp_paramhdr param_hdr; __be32 response_seq; __be32 result; __be32 senders_next_tsn; __be32 receivers_next_tsn; }; enum { SCTP_DSCP_SET_MASK = 0x1, SCTP_DSCP_VAL_MASK = 0xfc, SCTP_FLOWLABEL_SET_MASK = 0x100000, SCTP_FLOWLABEL_VAL_MASK = 0xfffff }; /* UDP Encapsulation * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4 * * The error cause indicating an "Restart of an Association with * New Encapsulation Port" * * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cause Code = 14 | Cause Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Current Encapsulation Port | New Encapsulation Port | * +-------------------------------+-------------------------------+ */ struct sctp_new_encap_port_hdr { __be16 cur_port; __be16 new_port; }; /* Round an int up to the next multiple of 4. */ #define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ #define SCTP_TRUNC4(s) ((s)&~3) #endif /* __LINUX_SCTP_H__ */ |
| 57 59 71 71 73 1 2 71 71 71 8 50 14 57 57 12 419 419 221 223 21313 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> * * Based on the original implementation which is: * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * * Parts of the original code have been moved to arch/x86/vdso/vma.c * * This file implements vsyscall emulation. vsyscalls are a legacy ABI: * Userspace can request certain kernel services by calling fixed * addresses. This concept is problematic: * * - It interferes with ASLR. * - It's awkward to write code that lives in kernel addresses but is * callable by userspace at fixed addresses. * - The whole concept is impossible for 32-bit compat userspace. * - UML cannot easily virtualize a vsyscall. * * As of mid-2014, I believe that there is no new userspace code that * will use a vsyscall if the vDSO is present. I hope that there will * soon be no new userspace code that will ever use a vsyscall. * * The code in this file emulates vsyscalls when notified of a page * fault to a vsyscall address. */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/mm_types.h> #include <linux/syscalls.h> #include <linux/ratelimit.h> #include <asm/vsyscall.h> #include <asm/unistd.h> #include <asm/fixmap.h> #include <asm/traps.h> #include <asm/paravirt.h> #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) XONLY; #else #error VSYSCALL config is broken #endif static int __init vsyscall_setup(char *str) { if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; else if (!strcmp("xonly", str)) vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else return -EINVAL; return 0; } return -EINVAL; } early_param("vsyscall", vsyscall_setup); static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { if (!show_unhandled_signals) return; printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n", level, current->comm, task_pid_nr(current), message, regs->ip, regs->cs, regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) { int nr; if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; if (nr >= 3) return -EINVAL; return nr; } static bool write_ok_or_segv(unsigned long ptr, size_t size) { if (!access_ok((void __user *)ptr, size)) { struct thread_struct *thread = ¤t->thread; thread->error_code = X86_PF_USER | X86_PF_WRITE; thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } bool emulate_vsyscall(unsigned long error_code, struct pt_regs *regs, unsigned long address) { unsigned long caller; int vsyscall_nr, syscall_nr, tmp; long ret; unsigned long orig_dx; /* Write faults or kernel-privilege faults never get fixed up. */ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; /* * Assume that faults at regs->ip are because of an * instruction fetch. Return early and avoid * emulation for faults during data accesses: */ if (address != regs->ip) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; /* * User code tried and failed to read the vsyscall page. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); return false; } /* * X86_PF_INSTR is only set when NX is supported. When * available, use it to double-check that the emulation code * is only being used for instruction fetches: */ if (cpu_feature_enabled(X86_FEATURE_NX)) WARN_ON_ONCE(!(error_code & X86_PF_INSTR)); /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. */ if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); return false; } vsyscall_nr = addr_to_vsyscall_nr(address); trace_emulate_vsyscall(vsyscall_nr); if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs, "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); goto sigsegv; } if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "vsyscall with bad stack (exploit attempt?)"); goto sigsegv; } /* * Check for access_ok violations and find the syscall nr. * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ switch (vsyscall_nr) { case 0: if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_gettimeofday; break; case 1: if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_time; break; case 2: if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) { ret = -EFAULT; goto check_fault; } syscall_nr = __NR_getcpu; break; } /* * Handle seccomp. regs->ip must be the original value. * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. * * We could optimize the seccomp disabled case, but performance * here doesn't matter. */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; if (tmp) goto do_ret; /* skip requested */ /* * With a real vsyscall, page faults cause SIGSEGV. */ ret = -EFAULT; switch (vsyscall_nr) { case 0: /* this decodes regs->di and regs->si on its own */ ret = __x64_sys_gettimeofday(regs); break; case 1: /* this decodes regs->di on its own */ ret = __x64_sys_time(regs); break; case 2: /* while we could clobber regs->dx, we didn't in the past... */ orig_dx = regs->dx; regs->dx = 0; /* this decodes regs->di, regs->si and regs->dx on its own */ ret = __x64_sys_getcpu(regs); regs->dx = orig_dx; break; } check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); goto sigsegv; } regs->ax = ret; do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; return true; sigsegv: force_sig(SIGSEGV); return true; } /* * A pseudo VMA to allow ptrace access for the vsyscall page. This only * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, .vm_flags = VM_READ | VM_EXEC, .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) return NULL; #endif if (vsyscall_mode == NONE) return NULL; return &gate_vma; } int in_gate_area(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; return (addr >= vma->vm_start) && (addr < vma->vm_end); } /* * Use this when you have no reliable mm, typically from interrupt * context. It is less reliable than using a task's mm and may give * false positives. */ int in_gate_area_no_mm(unsigned long addr) { return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } /* * The VSYSCALL page is the only user-accessible page in the kernel address * range. Normally, the kernel page tables can have _PAGE_USER clear, but * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls * are enabled. * * Some day we may create a "minimal" vsyscall mode in which we emulate * vsyscalls but leave the page not present. If so, we skip calling * this. */ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); pmd = pmd_offset(pud, VSYSCALL_ADDR); set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); } void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); /* * For full emulation, the page needs to exist for real. In * execute-only mode, there is no PTE at all backing the vsyscall * page. */ if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } if (vsyscall_mode == XONLY) vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } |
| 1 1 39 39 39 39 53 26 26 26 39 39 39 2 1 2 2 2 1 1 1 1 1 64 64 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/gso.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> #include "tun_vnet.h" #define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE) static struct proto tap_proto = { .name = "tap", .owner = THIS_MODULE, .obj_size = sizeof(struct tap_queue), }; #define TAP_NUM_DEVS (1U << MINORBITS) static LIST_HEAD(major_list); struct major_info { struct rcu_head rcu; dev_t major; struct idr minor_idr; spinlock_t minor_lock; const char *device_name; struct list_head next; }; #define GOODCOPY_LEN 128 static const struct proto_ops tap_socket_ops; #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST) static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } /* * RCU usage: * The tap_queue and the macvlan_dev are loosely coupled, the * pointers from one to the other can only be read while rcu_read_lock * or rtnl is held. * * Both the file and the macvlan_dev hold a reference on the tap_queue * through sock_hold(&q->sk). When the macvlan_dev goes away first, * q->vlan becomes inaccessible. When the files gets closed, * tap_get_queue() fails. * * There may still be references to the struct sock inside of the * queue from outbound SKBs, but these never reference back to the * file or the dev. The data structure is freed through __sk_free * when both our references and any pending SKBs are gone. */ static int tap_enable_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { int err = -EINVAL; ASSERT_RTNL(); if (q->enabled) goto out; err = 0; rcu_assign_pointer(tap->taps[tap->numvtaps], q); q->queue_index = tap->numvtaps; q->enabled = true; tap->numvtaps++; out: return err; } /* Requires RTNL */ static int tap_set_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { if (tap->numqueues == MAX_TAP_QUEUES) return -EBUSY; rcu_assign_pointer(q->tap, tap); rcu_assign_pointer(tap->taps[tap->numvtaps], q); sock_hold(&q->sk); q->file = file; q->queue_index = tap->numvtaps; q->enabled = true; file->private_data = q; list_add_tail(&q->next, &tap->queue_list); tap->numvtaps++; tap->numqueues++; return 0; } static int tap_disable_queue(struct tap_queue *q) { struct tap_dev *tap; struct tap_queue *nq; ASSERT_RTNL(); if (!q->enabled) return -EINVAL; tap = rtnl_dereference(q->tap); if (tap) { int index = q->queue_index; BUG_ON(index >= tap->numvtaps); nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]); nq->queue_index = index; rcu_assign_pointer(tap->taps[index], nq); RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL); q->enabled = false; tap->numvtaps--; } return 0; } /* * The file owning the queue got closed, give up both * the reference that the files holds as well as the * one from the macvlan_dev if that still exists. * * Using the spinlock makes sure that we don't get * to the queue again after destroying it. */ static void tap_put_queue(struct tap_queue *q) { struct tap_dev *tap; rtnl_lock(); tap = rtnl_dereference(q->tap); if (tap) { if (q->enabled) BUG_ON(tap_disable_queue(q)); tap->numqueues--; RCU_INIT_POINTER(q->tap, NULL); sock_put(&q->sk); list_del_init(&q->next); } rtnl_unlock(); synchronize_rcu(); sock_put(&q->sk); } /* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash * to select a queue. If all fails, find the first available queue. * Cache vlan->numvtaps since it can become zero during the execution * of this function. */ static struct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb) { struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint * and validate that the result isn't NULL - in case we are * racing against queue removal. */ int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) goto out; if (numvtaps == 1) goto single; /* Check if we can use flow to select a queue */ rxq = skb_get_hash(skb); if (rxq) { queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out; } if (likely(skb_rx_queue_recorded(skb))) { rxq = skb_get_rx_queue(skb); while (unlikely(rxq >= numvtaps)) rxq -= numvtaps; queue = rcu_dereference(tap->taps[rxq]); goto out; } single: queue = rcu_dereference(tap->taps[0]); out: return queue; } /* * The net_device is going away, give up the reference * that it holds on all queues and safely set the pointer * from the queues to NULL. */ void tap_del_queues(struct tap_dev *tap) { struct tap_queue *q, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(q, tmp, &tap->queue_list, next) { list_del_init(&q->next); RCU_INIT_POINTER(q->tap, NULL); if (q->enabled) tap->numvtaps--; tap->numqueues--; sock_put(&q->sk); } BUG_ON(tap->numvtaps); BUG_ON(tap->numqueues); /* guarantee that any future tap_set_queue will fail */ tap->numvtaps = MAX_TAP_QUEUES; } EXPORT_SYMBOL_GPL(tap_del_queues); rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; enum skb_drop_reason drop_reason; tap = tap_dev_get_rcu(dev); if (!tap) return RX_HANDLER_PASS; q = tap_get_queue(tap, skb); if (!q) return RX_HANDLER_PASS; skb_push(skb, ETH_HLEN); /* Apply the forward feature mask so that we perform segmentation * according to users wishes. This only works if VNET_HDR is * enabled. */ if (q->flags & IFF_VNET_HDR) features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; if (IS_ERR(segs)) { drop_reason = SKB_DROP_REASON_SKB_GSO_SEG; goto drop; } if (!segs) { if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } goto wake_up; } consume_skb(skb); skb_list_walk_safe(segs, skb, next) { skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; kfree_skb_reason(skb, drop_reason); kfree_skb_list_reason(next, drop_reason); break; } } } else { /* If we receive a partial checksum and the tap side * doesn't support checksum offload, compute the checksum. * Note: it doesn't matter which checksum feature to * check, we either support them all or none. */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) { drop_reason = SKB_DROP_REASON_SKB_CSUM; goto drop; } if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } } wake_up: wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); return RX_HANDLER_CONSUMED; drop: /* Count errors/drops only here, thus don't care about args. */ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); kfree_skb_reason(skb, drop_reason); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); static struct major_info *tap_get_major(int major) { struct major_info *tap_major; list_for_each_entry_rcu(tap_major, &major_list, next) { if (tap_major->major == major) return tap_major; } return NULL; } int tap_get_minor(dev_t major, struct tap_dev *tap) { int retval = -ENOMEM; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { retval = -EINVAL; goto unlock; } spin_lock(&tap_major->minor_lock); retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC); if (retval >= 0) { tap->minor = retval; } else if (retval == -ENOSPC) { netdev_err(tap->dev, "Too many tap devices\n"); retval = -EINVAL; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return retval < 0 ? retval : 0; } EXPORT_SYMBOL_GPL(tap_get_minor); void tap_free_minor(dev_t major, struct tap_dev *tap) { struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { goto unlock; } spin_lock(&tap_major->minor_lock); if (tap->minor) { idr_remove(&tap_major->minor_idr, tap->minor); tap->minor = 0; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(tap_free_minor); static struct tap_dev *dev_get_by_tap_file(int major, int minor) { struct net_device *dev = NULL; struct tap_dev *tap; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(major); if (!tap_major) { tap = NULL; goto unlock; } spin_lock(&tap_major->minor_lock); tap = idr_find(&tap_major->minor_idr, minor); if (tap) { dev = tap->dev; dev_hold(dev); } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return tap; } static void tap_sock_write_space(struct sock *sk) { wait_queue_head_t *wqueue; if (!sock_writeable(sk) || !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } static void tap_sock_destruct(struct sock *sk) { struct tap_queue *q = container_of(sk, struct tap_queue, sk); ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb); } static int tap_open(struct inode *inode, struct file *file) { struct net *net = current->nsproxy->net_ns; struct tap_dev *tap; struct tap_queue *q; int err = -ENODEV; rtnl_lock(); tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); if (!tap) goto err; err = -ENOMEM; q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tap_proto, 0); if (!q) goto err; if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) { sk_free(&q->sk); goto err; } init_waitqueue_head(&q->sock.wq.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; q->sock.file = file; q->sock.ops = &tap_socket_ops; sock_init_data_uid(&q->sock, &q->sk, current_fsuid()); q->sk.sk_write_space = tap_sock_write_space; q->sk.sk_destruct = tap_sock_destruct; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); /* * so far only KVM virtio_net uses tap, enable zero copy between * guest kernel and host kernel when lower device supports zerocopy * * The macvlan supports zerocopy iff the lower device supports zero * copy so we don't have to look at the lower device directly. */ if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) sock_set_flag(&q->sk, SOCK_ZEROCOPY); err = tap_set_queue(tap, file, q); if (err) { /* tap_sock_destruct() will take care of freeing ptr_ring */ goto err_put; } /* tap groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; dev_put(tap->dev); rtnl_unlock(); return err; err_put: sock_put(&q->sk); err: if (tap) dev_put(tap->dev); rtnl_unlock(); return err; } static int tap_release(struct inode *inode, struct file *file) { struct tap_queue *q = file->private_data; tap_put_queue(q); return 0; } static __poll_t tap_poll(struct file *file, poll_table *wait) { struct tap_queue *q = file->private_data; __poll_t mask = EPOLLERR; if (!q) goto out; mask = 0; poll_wait(file, &q->sock.wq.wait, wait); if (!ptr_ring_empty(&q->ring)) mask |= EPOLLIN | EPOLLRDNORM; if (sock_writeable(&q->sk) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= EPOLLOUT | EPOLLWRNORM; out: return mask; } static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } /* Neighbour code has some assumptions on HH_DATA_MOD alignment */ #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) /* Get packet from user space buffer */ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct iov_iter *from, int noblock) { int good_linear = SKB_MAX_HEAD(TAP_RESERVE); struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from); unsigned long len = total_len; int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; int hdr_len = 0; int copylen = 0; int depth; bool zerocopy = false; size_t linear; enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); hdr_len = tun_vnet_hdr_get(vnet_hdr_len, q->flags, from, &vnet_hdr); if (hdr_len < 0) { err = hdr_len; goto err; } len -= vnet_hdr_len; } err = -EINVAL; if (unlikely(len < ETH_HLEN)) goto err; if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; copylen = clamp(hdr_len ?: GOODCOPY_LEN, ETH_HLEN, good_linear); linear = copylen; i = *from; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; linear = clamp(hdr_len, ETH_HLEN, good_linear); } skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen, linear, noblock, &err); if (!skb) goto err; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto err_kfree; } skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; rcu_read_lock(); tap = rcu_dereference(q->tap); if (!tap) { kfree_skb(skb); rcu_read_unlock(); return total_len; } skb->dev = tap->dev; if (vnet_hdr_len) { err = tun_vnet_hdr_to_skb(q->flags, skb, &vnet_hdr); if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR; goto err_kfree; } } skb_probe_transport_header(skb); /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } dev_queue_xmit(skb); rcu_read_unlock(); return total_len; err_kfree: kfree_skb_reason(skb, drop_reason); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; return tap_get_user(q, NULL, from, noblock); } /* Put packet to the user space buffer */ static ssize_t tap_put_user(struct tap_queue *q, const struct sk_buff *skb, struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr); if (ret) return ret; ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr); if (ret) return ret; } total = vnet_hdr_len; total += skb->len; if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } ret = skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: return ret ? ret : total; } static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, int noblock, struct sk_buff *skb) { DEFINE_WAIT(wait); ssize_t ret = 0; if (!iov_iter_count(to)) { kfree_skb(skb); return 0; } if (skb) goto put; while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE); /* Read frames from the queue */ skb = ptr_ring_consume(&q->ring); if (skb) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); } if (!noblock) finish_wait(sk_sleep(&q->sk), &wait); put: if (skb) { ret = tap_put_user(q, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; ssize_t len = iov_iter_count(to), ret; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tap_do_read(q, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; return ret; } static struct tap_dev *tap_get_tap_dev(struct tap_queue *q) { struct tap_dev *tap; ASSERT_RTNL(); tap = rtnl_dereference(q->tap); if (tap) dev_hold(tap->dev); return tap; } static void tap_put_tap_dev(struct tap_dev *tap) { dev_put(tap->dev); } static int tap_ioctl_set_queue(struct file *file, unsigned int flags) { struct tap_queue *q = file->private_data; struct tap_dev *tap; int ret; tap = tap_get_tap_dev(q); if (!tap) return -EINVAL; if (flags & IFF_ATTACH_QUEUE) ret = tap_enable_queue(tap, file, q); else if (flags & IFF_DETACH_QUEUE) ret = tap_disable_queue(q); else ret = -EINVAL; tap_put_tap_dev(tap); return ret; } static int set_offload(struct tap_queue *q, unsigned long arg) { struct tap_dev *tap; netdev_features_t features; netdev_features_t feature_mask = 0; tap = rtnl_dereference(q->tap); if (!tap) return -ENOLINK; features = tap->dev->features; if (arg & TUN_F_CSUM) { feature_mask = NETIF_F_HW_CSUM; if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) feature_mask |= NETIF_F_TSO_ECN; if (arg & TUN_F_TSO4) feature_mask |= NETIF_F_TSO; if (arg & TUN_F_TSO6) feature_mask |= NETIF_F_TSO6; } /* TODO: for now USO4 and USO6 should work simultaneously */ if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= NETIF_F_GSO_UDP_L4; } /* tun/tap driver inverts the usage for TSO offloads, where * setting the TSO bit means that the userspace wants to * accept TSO frames and turning it off means that user space * does not support TSO. * For tap, we have to invert it to mean the same thing. * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames. */ if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) || (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= RX_OFFLOADS; else features &= ~RX_OFFLOADS; /* tap_features are the same as features on tun/tap and * reflect user expectations. */ tap->tap_features = feature_mask; if (tap->update_features) tap->update_features(tap, features); return 0; } /* * provide compatibility with generic tun/tap interface */ static long tap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tap_queue *q = file->private_data; struct tap_dev *tap; void __user *argp = (void __user *)arg; struct ifreq __user *ifr = argp; unsigned int __user *up = argp; unsigned short u; int __user *sp = argp; struct sockaddr_storage ss; int s; int ret; switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT; ret = 0; if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP)) ret = -EINVAL; else q->flags = (q->flags & ~TAP_IFFEATURES) | u; return ret; case TUNGETIFF: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; u = q->flags; if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || put_user(u, &ifr->ifr_flags)) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT; rtnl_lock(); ret = tap_ioctl_set_queue(file, u); rtnl_unlock(); return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up)) return -EFAULT; return 0; case TUNSETSNDBUF: if (get_user(s, sp)) return -EFAULT; if (s <= 0) return -EINVAL; q->sk.sk_sndbuf = s; return 0; case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO | TUN_F_USO4 | TUN_F_USO6)) return -EINVAL; rtnl_lock(); ret = set_offload(q, arg); rtnl_unlock(); return ret; case SIOCGIFHWADDR: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; netif_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev), tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || copy_to_user(&ifr->ifr_hwaddr, &ss, sizeof(ifr->ifr_hwaddr))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case SIOCSIFHWADDR: if (copy_from_user(&ss, &ifr->ifr_hwaddr, sizeof(ifr->ifr_hwaddr))) return -EFAULT; rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } if (tap->dev->addr_len > sizeof(ifr->ifr_hwaddr)) ret = -EINVAL; else ret = dev_set_mac_address_user(tap->dev, &ss, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; default: return tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, cmd, sp); } } static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, .release = tap_release, .read_iter = tap_read_iter, .write_iter = tap_write_iter, .poll = tap_poll, .unlocked_ioctl = tap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) { struct virtio_net_hdr *gso = xdp->data_hard_start; int buflen = xdp->frame_sz; int vnet_hdr_len = 0; struct tap_dev *tap; struct sk_buff *skb; int err, depth; if (unlikely(xdp->data_end - xdp->data < ETH_HLEN)) { err = -EINVAL; goto err; } if (q->flags & IFF_VNET_HDR) vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { err = -ENOMEM; goto err; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { err = tun_vnet_hdr_to_skb(q->flags, skb, gso); if (err) goto err_kfree; } /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; skb_probe_transport_header(skb); dev_queue_xmit(skb); } else { kfree_skb(skb); } rcu_read_unlock(); return 0; err_kfree: kfree_skb(skb); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; int i; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { for (i = 0; i < ctl->num; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; tap_get_user_xdp(q, xdp); } return 0; } return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); } static int tap_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct sk_buff *skb = m->msg_control; int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { kfree_skb(skb); return -EINVAL; } ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } return ret; } static int tap_peek_len(struct socket *sock) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag); } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tap_socket_ops = { .sendmsg = tap_sendmsg, .recvmsg = tap_recvmsg, .peek_len = tap_peek_len, }; /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tap_get_socket(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->sock; } EXPORT_SYMBOL_GPL(tap_get_socket); struct ptr_ring *tap_get_ptr_ring(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->ring; } EXPORT_SYMBOL_GPL(tap_get_ptr_ring); int tap_queue_resize(struct tap_dev *tap) { struct net_device *dev = tap->dev; struct tap_queue *q; struct ptr_ring **rings; int n = tap->numqueues; int ret, i = 0; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, __skb_array_destroy_skb); kfree(rings); return ret; } EXPORT_SYMBOL_GPL(tap_queue_resize); static int tap_list_add(dev_t major, const char *device_name) { struct major_info *tap_major; tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC); if (!tap_major) return -ENOMEM; tap_major->major = MAJOR(major); idr_init(&tap_major->minor_idr); spin_lock_init(&tap_major->minor_lock); tap_major->device_name = device_name; list_add_tail_rcu(&tap_major->next, &major_list); return 0; } int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, const char *device_name, struct module *module) { int err; err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name); if (err) goto out1; cdev_init(tap_cdev, &tap_fops); tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; err = tap_list_add(*tap_major, device_name); if (err) goto out3; return 0; out3: cdev_del(tap_cdev); out2: unregister_chrdev_region(*tap_major, TAP_NUM_DEVS); out1: return err; } EXPORT_SYMBOL_GPL(tap_create_cdev); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev) { struct major_info *tap_major, *tmp; cdev_del(tap_cdev); unregister_chrdev_region(major, TAP_NUM_DEVS); list_for_each_entry_safe(tap_major, tmp, &major_list, next) { if (tap_major->major == MAJOR(major)) { idr_destroy(&tap_major->minor_idr); list_del_rcu(&tap_major->next); kfree_rcu(tap_major, rcu); } } } EXPORT_SYMBOL_GPL(tap_destroy_cdev); MODULE_DESCRIPTION("Common library for drivers implementing the TAP interface"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("NETDEV_INTERNAL"); |
| 72 72 107 72 51 4 218 132 11 62 1629 11 17 5 45 10 65 56 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FIB_RULES_H #define __NET_FIB_RULES_H #include <linux/types.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/fib_rules.h> #include <linux/refcount.h> #include <net/flow.h> #include <net/rtnetlink.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> struct fib_kuid_range { kuid_t start; kuid_t end; }; struct fib_rule { struct list_head list; int iifindex; int oifindex; u32 mark; u32 mark_mask; u32 flags; u32 table; u8 action; u8 l3mdev; u8 proto; u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; refcount_t refcnt; u32 pref; int suppress_ifgroup; int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; u16 sport_mask; u16 dport_mask; u8 iif_is_l3_master; u8 oif_is_l3_master; struct rcu_head rcu; }; struct fib_lookup_arg { void *lookup_ptr; const void *lookup_data; void *result; struct fib_rule *rule; u32 table; int flags; #define FIB_LOOKUP_NOREF 1 #define FIB_LOOKUP_IGNORE_LINKSTATE 2 }; struct fib_rules_ops { int family; struct list_head list; int rule_size; int addr_size; int unresolved_rules; int nr_goto_rules; unsigned int fib_rules_seq; int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, struct nlattr **, struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush * the route cache if one exists. */ void (*flush_cache)(struct fib_rules_ops *ops); int nlgroup; struct list_head rules_list; struct module *owner; struct net *fro_net; struct rcu_head rcu; }; struct fib_rule_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_rule *rule; }; static inline void fib_rule_get(struct fib_rule *rule) { refcount_inc(&rule->refcnt); } static inline void fib_rule_put(struct fib_rule *rule) { if (refcount_dec_and_test(&rule->refcnt)) kfree_rcu(rule, rcu); } #ifdef CONFIG_NET_L3_MASTER_DEV static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->l3mdev ? arg->table : rule->table; } #else static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->table; } #endif static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) { if (nla[FRA_TABLE]) return nla_get_u32(nla[FRA_TABLE]); return frh->table; } static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) { return range->start != 0 && range->end != 0; } static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, __be16 port) { return ntohs(port) >= a->start && ntohs(port) <= a->end; } static inline bool fib_rule_port_match(const struct fib_rule_port_range *range, u16 port_mask, __be16 port) { if ((range->start ^ ntohs(port)) & port_mask) return false; if (!port_mask && fib_rule_port_range_set(range) && !fib_rule_port_inrange(range, port)) return false; return true; } static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && a->start <= a->end; } static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, struct fib_rule_port_range *b) { return a->start == b->start && a->end == b->end; } static inline bool fib_rule_port_is_range(const struct fib_rule_port_range *range) { return range->start != range->end; } static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || fib_rule_port_range_set(&rule->sport_range) || fib_rule_port_range_set(&rule->dport_range)); } struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(const struct net *net, int family); int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held); int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); #endif |
| 96 96 61 5 96 96 61 61 61 61 61 7 1 6 6 8 1 7 1 1 2 12 67 2 67 12 12 55 61 1 4 3 1 1 4 3 1 3 5 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 | // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> */ #define pr_fmt(fmt) "PM: hibernation: " fmt #include <crypto/acompress.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> #include <linux/async.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/pm.h> #include <linux/nmi.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/ktime.h> #include <linux/security.h> #include <linux/secretmem.h> #include <trace/events/power.h> #include "power.h" static int nocompress; static int noresume; static int nohibernate; static int resume_wait; static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP; /* * Compression/decompression algorithm to be used while saving/loading * image to/from disk. This would later be used in 'kernel/power/swap.c' * to allocate comp streams. */ char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, HIBERNATION_SHUTDOWN, HIBERNATION_REBOOT, #ifdef CONFIG_SUSPEND HIBERNATION_SUSPEND, #endif HIBERNATION_TEST_RESUME, /* keep last */ __HIBERNATION_AFTER_LAST }; #define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) #define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); #ifdef CONFIG_SUSPEND /** * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend */ bool pm_hibernation_mode_is_suspend(void) { return hibernation_mode == HIBERNATION_SUSPEND; } EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); #endif bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); } void hibernate_release(void) { atomic_inc(&hibernate_atomic); } bool hibernation_in_progress(void) { return !atomic_read(&hibernate_atomic); } bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION) && !secretmem_active() && !cxl_mem_active(); } /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { unsigned int sleep_flags; if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } sleep_flags = lock_system_sleep(); hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(hibernation_set_ops); static bool entering_platform_hibernation; bool system_entering_hibernation(void) { return entering_platform_hibernation; } EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG static unsigned int pm_test_delay = 5; module_param(pm_test_delay, uint, 0644); MODULE_PARM_DESC(pm_test_delay, "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { pr_info("hibernation debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) { if (pm_test_level == level) { hibernation_debug_sleep(); return 1; } return 0; } #else /* !CONFIG_PM_DEBUG */ static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ /** * platform_begin - Call platform to start hibernation. * @platform_mode: Whether or not to use the platform driver. */ static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->begin(PMSG_FREEZE) : 0; } /** * platform_end - Call platform to finish transition to the working state. * @platform_mode: Whether or not to use the platform driver. */ static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->end(); } /** * platform_pre_snapshot - Call platform to prepare the machine for hibernation. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to prepare the system for creating a hibernate image, * if so configured, and return an error code if that fails. */ static int platform_pre_snapshot(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_snapshot() : 0; } /** * platform_leave - Call platform to prepare a transition to the working state. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver prepare to prepare the machine for switching to the * normal mode of operation. * * This routine is called on one CPU with interrupts disabled. */ static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->leave(); } /** * platform_finish - Call platform to switch the system to the working state. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to switch the machine to the normal mode of * operation. * * This routine must be called after platform_prepare(). */ static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->finish(); } /** * platform_pre_restore - Prepare for hibernate image restoration. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to prepare the system for resume from a hibernation * image. * * If the restore fails after this function has been called, * platform_restore_cleanup() must be called. */ static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_restore() : 0; } /** * platform_restore_cleanup - Switch to the working state after failing restore. * @platform_mode: Whether or not to use the platform driver. * * Use the platform driver to switch the system to the normal mode of operation * after a failing restore. * * If platform_pre_restore() has been called before the failing restore, this * function must be called too, regardless of the result of * platform_pre_restore(). */ static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->restore_cleanup(); } /** * platform_recover - Recover from a failure to suspend devices. * @platform_mode: Whether or not to use the platform driver. */ static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) hibernation_ops->recover(); } /** * swsusp_show_speed - Print time elapsed between two events during hibernation. * @start: Starting event. * @stop: Final event. * @nr_pages: Number of memory pages processed between @start and @stop. * @msg: Additional diagnostic message to print. */ void swsusp_show_speed(ktime_t start, ktime_t stop, unsigned nr_pages, char *msg) { ktime_t diff; u64 elapsed_centisecs64; unsigned int centisecs; unsigned int k; unsigned int kps; diff = ktime_sub(stop, start); elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", msg, k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); } __weak int arch_resume_nosmt(void) { return 0; } /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * * Execute device drivers' "late" and "noirq" freeze callbacks, create a * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ static int create_image(int platform_mode) { int error; error = dpm_suspend_end(PMSG_FREEZE); if (error) { pr_err("Some devices failed to power down, aborting\n"); return error; } error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; error = pm_sleep_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) { pr_err("Some system devices failed to power down, aborting\n"); goto Enable_irqs; } if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); /* Restore control flow magically appears here */ restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) pr_err("Error %d creating image\n", error); if (!in_suspend) { events_check_enabled = false; clear_or_poison_free_pages(); } platform_leave(platform_mode); Power_up: syscore_resume(); Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: pm_sleep_enable_secondary_cpus(); /* Allow architectures to do nosmt-specific post-resume dances */ if (!in_suspend) error = arch_resume_nosmt(); Platform_finish: platform_finish(platform_mode); dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; } static void shrink_shmem_memory(void) { struct sysinfo info; unsigned long nr_shmem_pages, nr_freed_pages; si_meminfo(&info); nr_shmem_pages = info.sharedram; /* current page count used for shmem */ /* * The intent is to reclaim all shmem pages. Though shrink_all_memory() can * only reclaim about half of them, it's enough for creating the hibernation * image. */ nr_freed_pages = shrink_all_memory(nr_shmem_pages); pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n", nr_shmem_pages, nr_freed_pages); } /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with system_transition_mutex held. */ int hibernation_snapshot(int platform_mode) { pm_message_t msg; int error; pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) goto Close; error = freeze_kernel_threads(); if (error) goto Cleanup; if (hibernation_test(TEST_FREEZER)) { /* * Indicate to the caller that we are returning due to a * successful freezer test. */ freezer_test_done = true; goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); goto Thaw; } /* * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem * pages will use lots of system memory, causing hibernation image creation * fail due to insufficient free memory. * This call is to force flush the shmem pages to swap disk and reclaim * the system memory so that image creation can succeed. */ shrink_shmem_memory(); console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); if (error || hibernation_test(TEST_DEVICES)) platform_recover(platform_mode); else error = create_image(platform_mode); /* * In the case that we call create_image() above, the control * returns here (1) after the image has been created or the * image creation has failed and (2) after a successful restore. */ /* We may need to release the preallocated image pages here. */ if (error || !in_suspend) swsusp_free(); msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; dpm_resume(msg); if (error || !in_suspend) pm_restore_gfp_mask(); console_resume_all(); dpm_complete(msg); Close: platform_end(platform_mode); return error; Thaw: thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; } int __weak hibernate_resume_nonboot_cpu_disable(void) { return suspend_disable_secondary_cpus(); } /** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * * Execute device drivers' "noirq" and "late" freeze callbacks, restore the * contents of highmem that have not been restored yet from the image and run * the low-level code that will restore the remaining contents of memory and * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; error = dpm_suspend_end(PMSG_QUIESCE); if (error) { pr_err("Some devices failed to power down, aborting resume\n"); return error; } error = platform_pre_restore(platform_mode); if (error) goto Cleanup; cpuidle_pause(); error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) goto Enable_irqs; save_processor_state(); error = restore_highmem(); if (!error) { error = swsusp_arch_resume(); /* * The code below is only ever reached in case of a failure. * Otherwise, execution continues at the place where * swsusp_arch_suspend() was called. */ BUG_ON(!error); /* * This call to restore_highmem() reverts the changes made by * the previous one. */ restore_highmem(); } /* * The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid * subsequent failures. */ swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); syscore_resume(); Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: pm_sleep_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); dpm_resume_start(PMSG_RECOVER); return error; } /** * hibernation_restore - Quiesce devices and restore from a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with system_transition_mutex held. If it is * successful, control reappears in the restored target kernel in * hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { int error; pm_prepare_console(); console_suspend_all(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); /* * The above should either succeed and jump to the new kernel, * or return with an error. Otherwise things are just * undefined, so let's be paranoid. */ BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); console_resume_all(); pm_restore_console(); return error; } /** * hibernation_platform_enter - Power off the system using the platform driver. */ int hibernation_platform_enter(void) { int error; if (!hibernation_ops) return -ENOSYS; /* * We have cancelled the power transition by running * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; entering_platform_hibernation = true; console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; error = hibernation_ops->prepare(); if (error) goto Platform_finish; error = pm_sleep_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) goto Enable_irqs; if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } hibernation_ops->enter(); /* We should never get here */ while (1); Power_up: syscore_resume(); Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: pm_sleep_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); console_resume_all(); Close: hibernation_ops->end(); return error; } /** * power_down - Shut the machine down for hibernation. * * Use the platform driver, if configured, to put the system into the sleep * state corresponding to hibernation, or try to power it off or reboot, * depending on the value of hibernation_mode. */ static void power_down(void) { int error; #ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { error = suspend_devices_and_enter(mem_sleep_current); if (!error) goto exit; hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : HIBERNATION_SHUTDOWN; } #endif switch (hibernation_mode) { case HIBERNATION_REBOOT: kernel_restart(NULL); break; case HIBERNATION_PLATFORM: error = hibernation_platform_enter(); if (error == -EAGAIN || error == -EBUSY) { events_check_enabled = false; pr_info("Wakeup event detected during hibernation, rolling back.\n"); goto exit; } fallthrough; case HIBERNATION_SHUTDOWN: if (kernel_can_power_off()) { entering_platform_hibernation = true; kernel_power_off(); entering_platform_hibernation = false; } break; } kernel_halt(); /* * Valid image is on the disk, if we continue we risk serious data * corruption after resume. */ pr_crit("Power down manually\n"); while (1) cpu_relax(); exit: /* Restore swap signature. */ error = swsusp_unmark(); if (error) pr_err("Swap will be unusable! Try swapon -a.\n"); } static int load_image_and_restore(void) { int error; unsigned int flags; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { swsusp_close(); goto Unlock; } error = swsusp_read(&flags); swsusp_close(); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); pr_err("Failed to load image, recovering.\n"); swsusp_free(); free_basic_memory_bitmaps(); Unlock: unlock_device_hotplug(); return error; } #define COMPRESSION_ALGO_LZO "lzo" #define COMPRESSION_ALGO_LZ4 "lz4" /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { bool snapshot_test = false; unsigned int sleep_flags; int error; if (!hibernation_available()) { pm_pr_dbg("Hibernation not available.\n"); return -EPERM; } /* * Query for the compression algorithm support if compression is enabled. */ if (!nocompress) { strscpy(hib_comp_algo, hibernate_compressor); if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; } } sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } pr_info("hibernation entry\n"); pm_prepare_console(); error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); if (error) goto Restore; ksys_sync_helper(); filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) goto Exit; lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) goto Free_bitmaps; if (in_suspend) { unsigned int flags = 0; if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; if (nocompress) { flags |= SF_NOCOMPRESS_MODE; } else { flags |= SF_CRC32_MODE; /* * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4 * to override this behaviour and use LZ4. * * Refer kernel/power/power.h for more details */ if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4)) flags |= SF_COMPRESSION_ALG_LZ4; else flags |= SF_COMPRESSION_ALG_LZO; } pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { if (hibernation_mode == HIBERNATION_TEST_RESUME) snapshot_test = true; else power_down(); } in_suspend = 0; pm_restore_gfp_mask(); } else { pm_pr_dbg("Hibernation image restored successfully.\n"); } Free_bitmaps: free_basic_memory_bitmaps(); Thaw: unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); error = swsusp_check(false); if (!error) error = load_image_and_restore(); } thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); hibernate_release(); Unlock: unlock_system_sleep(sleep_flags); pr_info("hibernation exit\n"); return error; } /** * hibernate_quiet_exec - Execute a function with all devices frozen. * @func: Function to execute. * @data: Data pointer to pass to @func. * * Return the @func return value or an error code if it cannot be executed. */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { unsigned int sleep_flags; int error; sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; goto unlock; } pm_prepare_console(); error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); if (error) goto restore; filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) goto exit; lock_device_hotplug(); pm_suspend_clear_flags(); error = platform_begin(true); if (error) goto thaw; error = freeze_kernel_threads(); if (error) goto thaw; error = dpm_prepare(PMSG_FREEZE); if (error) goto dpm_complete; console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) goto dpm_resume; error = dpm_suspend_end(PMSG_FREEZE); if (error) goto dpm_resume; error = platform_pre_snapshot(true); if (error) goto skip; error = func(data); skip: platform_finish(true); dpm_resume_start(PMSG_THAW); dpm_resume: dpm_resume(PMSG_THAW); console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); thaw_kernel_threads(); thaw: platform_end(true); unlock_device_hotplug(); thaw_processes(); exit: filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); restore: pm_restore_console(); hibernate_release(); unlock: unlock_system_sleep(sleep_flags); return error; } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); static int __init find_resume_device(void) { if (!strlen(resume_file)) return -ENOENT; pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { pr_info("Waiting %dsec before reading resume device ...\n", resume_delay); ssleep(resume_delay); } /* Check if the device is there */ if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) return 0; /* * Some device discovery might still be in progress; we need to wait for * this to finish. */ wait_for_device_probe(); if (resume_wait) { while (early_lookup_bdev(resume_file, &swsusp_resume_device)) msleep(10); async_synchronize_full(); } return early_lookup_bdev(resume_file, &swsusp_resume_device); } static int software_resume(void) { int error; pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); mutex_lock(&system_transition_mutex); error = swsusp_check(true); if (error) goto Unlock; /* * Check if the hibernation image is compressed. If so, query for * the algorithm support. */ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); else strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); error = -EOPNOTSUPP; goto Unlock; } } /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; swsusp_close(); goto Unlock; } pr_info("resume from hibernation\n"); pm_prepare_console(); error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (error) goto Restore; filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); if (error) { filesystems_thaw(); goto Close_Finish; } error = freeze_kernel_threads(); if (error) { thaw_processes(); filesystems_thaw(); goto Close_Finish; } error = load_image_and_restore(); thaw_processes(); filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); Restore: pm_restore_console(); pr_info("resume failed (%d)\n", error); hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: swsusp_close(); goto Finish; } /** * software_resume_initcall - Resume from a saved hibernation image. * * This routine is called as a late initcall, when all devices have been * discovered and initialized already. * * The image reading code is called to see if there is a hibernation image * available for reading. If that is the case, devices are quiesced and the * contents of memory is restored from the saved image. * * If this is successful, control reappears in the restored target kernel in * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ static int __init software_resume_initcall(void) { /* * If the user said "noresume".. bail out early. */ if (noresume || !hibernation_available()) return 0; if (!swsusp_resume_device) { int error = find_resume_device(); if (error) return error; } return software_resume(); } late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { [HIBERNATION_PLATFORM] = "platform", [HIBERNATION_SHUTDOWN] = "shutdown", [HIBERNATION_REBOOT] = "reboot", #ifdef CONFIG_SUSPEND [HIBERNATION_SUSPEND] = "suspend", #endif [HIBERNATION_TEST_RESUME] = "test_resume", }; /* * /sys/power/disk - Control hibernation mode. * * Hibernation can be handled in several ways. There are a few different ways * to put the system into the sleep state: using the platform driver (e.g. ACPI * or other hibernation_ops), powering it off or rebooting it (for testing * mostly). * * The sysfs file /sys/power/disk provides an interface for selecting the * hibernation mode to use. Reading from this file causes the available modes * to be printed. There are 3 modes that can be supported: * * 'platform' * 'shutdown' * 'reboot' * * If a platform hibernation driver is in use, 'platform' will be supported * and will be used by default. Otherwise, 'shutdown' will be used by default. * The selected option (i.e. the one corresponding to the current value of * hibernation_mode) is enclosed by a square bracket. * * To select a given hibernation mode it is necessary to write the mode's * string representation (as returned by reading from /sys/power/disk) back * into /sys/power/disk. */ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; int i; if (!hibernation_available()) return sysfs_emit(buf, "[disabled]\n"); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) continue; switch (i) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif case HIBERNATION_TEST_RESUME: break; case HIBERNATION_PLATFORM: if (hibernation_ops) break; /* not a valid mode, continue with loop */ continue; } if (i == hibernation_mode) count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]); else count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]); } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int mode = HIBERNATION_INVALID; unsigned int sleep_flags; int error = 0; int len; char *p; int i; if (!hibernation_available()) return -EPERM; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } } if (mode != HIBERNATION_INVALID) { switch (mode) { case HIBERNATION_SHUTDOWN: case HIBERNATION_REBOOT: #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif case HIBERNATION_TEST_RESUME: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: if (hibernation_ops) hibernation_mode = mode; else error = -EINVAL; } } else error = -EINVAL; if (!error) pm_pr_dbg("Hibernation mode set to '%s'\n", hibernation_modes[mode]); unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; int len = n; char *name; dev_t dev; int error; if (!hibernation_available()) return n; if (len && buf[len-1] == '\n') len--; name = kstrndup(buf, len, GFP_KERNEL); if (!name) return -ENOMEM; error = lookup_bdev(name, &dev); if (error) { unsigned maj, min, offset; char *p, dummy; error = 0; if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { dev = MKDEV(maj, min); if (maj != MAJOR(dev) || min != MINOR(dev)) error = -EINVAL; } else { dev = new_decode_dev(simple_strtoul(name, &p, 16)); if (*p) error = -EINVAL; } } kfree(name); if (error) return error; sleep_flags = lock_system_sleep(); swsusp_resume_device = dev; unlock_system_sleep(sleep_flags); pm_pr_dbg("Configured hibernation resume from disk to %u\n", swsusp_resume_device); noresume = 0; software_resume(); return n; } power_attr(resume); static ssize_t resume_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block); } static ssize_t resume_offset_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long long offset; int rc; rc = kstrtoull(buf, 0, &offset); if (rc) return rc; swsusp_resume_block = offset; return n; } power_attr(resume_offset); static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long size; if (sscanf(buf, "%lu", &size) == 1) { image_size = size; return n; } return -EINVAL; } power_attr(image_size); static ssize_t reserved_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", reserved_size); } static ssize_t reserved_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long size; if (sscanf(buf, "%lu", &size) == 1) { reserved_size = size; return n; } return -EINVAL; } power_attr(reserved_size); static struct attribute *g[] = { &disk_attr.attr, &resume_offset_attr.attr, &resume_attr.attr, &image_size_attr.attr, &reserved_size_attr.attr, NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static int __init pm_disk_init(void) { return sysfs_create_group(power_kobj, &attr_group); } core_initcall(pm_disk_init); static int __init resume_setup(char *str) { if (noresume) return 1; strscpy(resume_file, str); return 1; } static int __init resume_offset_setup(char *str) { unsigned long long offset; if (noresume) return 1; if (sscanf(str, "%llu", &offset) == 1) swsusp_resume_block = offset; return 1; } static int __init hibernate_setup(char *str) { if (!strncmp(str, "noresume", 8)) { noresume = 1; } else if (!strncmp(str, "nocompress", 10)) { nocompress = 1; } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && !strncmp(str, "protect_image", 13)) { enable_restore_image_protection(); } return 1; } static int __init noresume_setup(char *str) { noresume = 1; return 1; } static int __init resumewait_setup(char *str) { resume_wait = 1; return 1; } static int __init resumedelay_setup(char *str) { int rc = kstrtouint(str, 0, &resume_delay); if (rc) pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } static int __init nohibernate_setup(char *str) { noresume = 1; nohibernate = 1; return 1; } static const char * const comp_alg_enabled[] = { #if IS_ENABLED(CONFIG_CRYPTO_LZO) COMPRESSION_ALGO_LZO, #endif #if IS_ENABLED(CONFIG_CRYPTO_LZ4) COMPRESSION_ALGO_LZ4, #endif }; static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { int index, ret; if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { ret = param_set_copystring(comp_alg_enabled[index], kp); if (!ret) strscpy(hib_comp_algo, comp_alg_enabled[index]); } else { ret = index; } mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", compressor); return ret; } static const struct kernel_param_ops hibernate_compressor_param_ops = { .set = hibernate_compressor_param_set, .get = param_get_string, }; static struct kparam_string hibernate_compressor_param_string = { .maxlen = sizeof(hibernate_compressor), .string = hibernate_compressor, }; module_param_cb(compressor, &hibernate_compressor_param_ops, &hibernate_compressor_param_string, 0644); MODULE_PARM_DESC(compressor, "Compression algorithm to be used with hibernation"); __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); |
| 5 6 14 1 1 1 2 2 2 4 2 1 5 7 7 8 8 1 1 4 4 3 3 17 1 1 1 4 3 2 1 3 2 33 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | // SPDX-License-Identifier: GPL-2.0-only /* * File: pn_netlink.c * * Phonet netlink interface * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Remi Denis-Courmont */ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/phonet.h> #include <linux/slab.h> #include <net/sock.h> #include <net/phonet/pn_dev.h> /* Device address handling */ static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, u32 portid, u32 seq, int event); void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(1), GFP_KERNEL); if (skb == NULL) goto errout; err = fill_addr(skb, ifindex, addr, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_PHONET_IFADDR, err); } static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U8 }, }; static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; struct net_device *dev; struct ifaddrmsg *ifm; int err; u8 pnaddr; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy, extack); if (err < 0) return err; ifm = nlmsg_data(nlh); if (tb[IFA_LOCAL] == NULL) return -EINVAL; pnaddr = nla_get_u8(tb[IFA_LOCAL]); if (pnaddr & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifm->ifa_index); if (!dev) { rcu_read_unlock(); return -ENODEV; } if (nlh->nlmsg_type == RTM_NEWADDR) err = phonet_address_add(dev, pnaddr); else err = phonet_address_del(dev, pnaddr); rcu_read_unlock(); if (!err) phonet_address_notify(net, nlh->nlmsg_type, ifm->ifa_index, pnaddr); return err; } static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, u32 portid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), 0); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_PHONET; ifm->ifa_prefixlen = 0; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_LINK; ifm->ifa_index = ifindex; if (nla_put_u8(skb, IFA_LOCAL, addr)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { int addr_idx = 0, addr_start_idx = cb->args[1]; int dev_idx = 0, dev_start_idx = cb->args[0]; struct phonet_device_list *pndevs; struct phonet_device *pnd; int err = 0; pndevs = phonet_device_list(sock_net(skb->sk)); rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { DECLARE_BITMAP(addrs, 64); u8 addr; if (dev_idx > dev_start_idx) addr_start_idx = 0; if (dev_idx++ < dev_start_idx) continue; addr_idx = 0; memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); for_each_set_bit(addr, addrs, 64) { if (addr_idx++ < addr_start_idx) continue; err = fill_addr(skb, READ_ONCE(pnd->netdev->ifindex), addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDR); if (err < 0) goto out; } } out: rcu_read_unlock(); cb->args[0] = dev_idx; cb->args[1] = addr_idx; return err; } /* Routes handling */ static int fill_route(struct sk_buff *skb, u32 ifindex, u8 dst, u32 portid, u32 seq, int event) { struct rtmsg *rtm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), 0); if (nlh == NULL) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_PHONET; rtm->rtm_dst_len = 6; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; rtm->rtm_protocol = RTPROT_STATIC; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_u8(skb, RTA_DST, dst) || nla_put_u32(skb, RTA_OIF, ifindex)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(1) + nla_total_size(4), GFP_KERNEL); if (skb == NULL) goto errout; err = fill_route(skb, ifindex, dst, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_PHONET_ROUTE, err); } static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U8 }, [RTA_OIF] = { .type = NLA_U32 }, }; static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[RTA_MAX+1]; bool sync_needed = false; struct net_device *dev; struct rtmsg *rtm; u32 ifindex; int err; u8 dst; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); if (rtm->rtm_table != RT_TABLE_MAIN || rtm->rtm_type != RTN_UNICAST) return -EINVAL; if (tb[RTA_DST] == NULL || tb[RTA_OIF] == NULL) return -EINVAL; dst = nla_get_u8(tb[RTA_DST]); if (dst & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; ifindex = nla_get_u32(tb[RTA_OIF]); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { rcu_read_unlock(); return -ENODEV; } if (nlh->nlmsg_type == RTM_NEWROUTE) { err = phonet_route_add(dev, dst); } else { err = phonet_route_del(dev, dst); if (!err) sync_needed = true; } rcu_read_unlock(); if (sync_needed) { synchronize_rcu(); dev_put(dev); } if (!err) rtm_phonet_notify(net, nlh->nlmsg_type, ifindex, dst); return err; } static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int err = 0; u8 addr; rcu_read_lock(); for (addr = cb->args[0]; addr < 64; addr++) { struct net_device *dev = phonet_route_get_rcu(net, addr << 2); if (!dev) continue; err = fill_route(skb, READ_ONCE(dev->ifindex), addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = addr; return err; } static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_module = { {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWADDR, .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELADDR, .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR, .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE, .doit = route_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE, .doit = route_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETROUTE, .dumpit = route_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; int __init phonet_netlink_register(void) { return rtnl_register_many(phonet_rtnl_msg_handlers); } |
| 129 166 112 37 227 140 92 48 144 144 142 142 103 192 192 158 158 213 160 299 33 5 91 26 106 47 47 107 107 37 104 110 2 110 110 110 110 190 191 198 1 75 262 262 71 263 185 185 30 165 71 184 180 35 45 84 2 97 192 2 188 47 118 206 167 10 346 274 14 169 5 287 243 5 154 60 175 286 243 137 240 171 110 110 236 30 88 36 1 84 84 5 1 6 1 170 170 351 352 106 158 204 171 353 114 298 298 74 57 57 73 74 3 105 105 105 1 185 44 162 92 4 88 46 18 25 1 1 1 3 3 3 3 2 1 1 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 | /* * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "addr.h" #include "group.h" #include "bcast.h" #include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" #include "name_table.h" #include "subscr.h" #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, MBR_PENDING, MBR_ACTIVE, MBR_RECLAIMING, MBR_REMITTED, MBR_LEAVING }; struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; u16 advertised; u16 window; u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; }; struct tipc_group { struct rb_root members; struct list_head small_win; struct list_head pending; struct list_head active; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; u32 scope; u32 portid; u16 member_cnt; u16 active_cnt; u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool *open; bool loopback; bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); static void tipc_group_open(struct tipc_member *m, bool *wakeup) { *wakeup = false; if (list_empty(&m->small_win)) return; list_del_init(&m->small_win); *m->group->open = true; *wakeup = true; } static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || m->state == MBR_REMITTED) grp->active_cnt--; } static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; /* Limit simultaneous reception from other members */ max_active = min(mcnt / 8, 64); max_active = max(max_active, 16); grp->max_active = max_active; /* Reserve blocks for active and idle members */ active_pool = max_active * ADV_ACTIVE; idle_pool = (mcnt - max_active) * ADV_IDLE; /* Scale to bytes, considering worst-case truesize/msgsize ratio */ return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) { if (!grp->loopback) return grp->portid; return 0; } struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) { u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; *grp->open = false; filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) return grp; kfree(grp); return NULL; } void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); } tipc_node_distr_xmit(net, &xmitq); *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); __skb_queue_purge(&m->deferredq); list_del(&m->list); kfree(m); } tipc_node_distr_xmit(net, &xmitq); tipc_nlist_purge(&grp->dests); tipc_topsrv_kern_unsubscr(net, grp->subid); kfree(grp); } static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; struct tipc_member *m; while (n) { m = container_of(n, struct tipc_member, tree_node); nkey = (u64)m->node << 32 | m->port; if (key < nkey) n = n->rb_left; else if (key > nkey) n = n->rb_right; else return m; } return NULL; } static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, u32 node, u32 port) { struct tipc_member *m; m = tipc_group_find_member(grp, node, port); if (m && tipc_group_is_receiver(m)) return m; return NULL; } static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (m->node == node) return m; } return NULL; } static int tipc_group_add_to_tree(struct tipc_group *grp, struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; struct tipc_member *tmp; n = &grp->members.rb_node; while (*n) { tmp = container_of(*n, struct tipc_member, tree_node); parent = *n; tmp = container_of(parent, struct tipc_member, tree_node); nkey = (u64)tmp->node << 32 | tmp->port; if (key < nkey) n = &(*n)->rb_left; else if (key > nkey) n = &(*n)->rb_right; else return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); return 0; } static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, u32 instance, int state) { struct tipc_member *m; int ret; m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; ret = tipc_group_add_to_tree(grp, m); if (ret < 0) { kfree(m); return NULL; } grp->member_cnt++; tipc_nlist_add(&grp->dests, m->node); m->state = state; return m; } void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance) { tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, struct tipc_member *m) { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; /* Check if we were waiting for replicast ack from this member */ if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) grp->bc_ackers--; list_del_init(&m->list); list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) tipc_nlist_del(&grp->dests, m->node); kfree(m); } struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) { return &grp->dests; } void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; seq->lower = grp->instance; seq->upper = grp->instance; *scope = grp->scope; } void tipc_group_update_member(struct tipc_member *m, int len) { struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; if (!tipc_group_is_receiver(m)) return; m->window -= len; if (m->window >= ADV_IDLE) return; list_del_init(&m->small_win); /* Sort member into small_window members' list */ list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { if (_m->window > m->window) break; } list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) grp->bc_ackers = ackers; grp->bc_snd_nxt++; } bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **mbr) { struct sk_buff_head xmitq; struct tipc_member *m; int adv, state; m = tipc_group_find_dest(grp, dnode, dport); if (!tipc_group_is_receiver(m)) { *mbr = NULL; return false; } *mbr = m; if (m->window >= len) return false; *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; } bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ if (grp->bc_ackers) { *grp->open = false; return true; } if (list_empty(&grp->small_win)) return false; m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_sort_msg() - sort msg into queue by bcast sequence number */ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) { struct tipc_msg *_hdr, *hdr = buf_msg(skb); u16 bc_seqno = msg_grp_bc_seqno(hdr); struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) continue; __skb_queue_before(defq, _skb, skb); return; } /* Bcast was not bypassed, - add to tail */ } /* Unicasts are never bypassed, - always add to tail */ __skb_queue_tail(defq, skb); } /* tipc_group_filter_msg() - determine if we should accept arriving message */ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; u32 node, port; int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; defq = &m->deferredq; tipc_group_sort_msg(skb, defq); while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); blks = msg_blocks(hdr); deliver = true; ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) break; /* Decide what to do with message */ switch (mtyp) { case TIPC_GRP_MCAST_MSG: if (msg_nameinst(hdr) != grp->instance) { update = true; deliver = false; } fallthrough; case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; case TIPC_GRP_MEMBER_EVT: if (m->state == MBR_LEAVING) leave = true; if (!grp->events) deliver = false; break; default: break; } /* Execute decisions */ __skb_dequeue(defq); if (deliver) __skb_queue_tail(inputq, skb); else kfree_skb(skb); if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { __skb_queue_purge(defq); tipc_group_delete_member(grp, m); break; } if (!update) continue; tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; drop: kfree_skb(skb); } void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { struct list_head *active = &grp->active; int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) return; m->advertised -= blks; switch (m->state) { case MBR_JOINED: /* First, decide if member can go active */ if (active_cnt <= max_active) { m->state = MBR_ACTIVE; list_add_tail(&m->list, active); grp->active_cnt++; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } else { m->state = MBR_PENDING; list_add_tail(&m->list, &grp->pending); } if (active_cnt < reclaim_limit) break; /* Reclaim from oldest active member, if possible */ if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); break; } /* Nobody to reclaim from; - revert oldest pending to JOINED */ pm = list_first_entry(&grp->pending, struct tipc_member, list); list_del_init(&pm->list); pm->state = MBR_JOINED; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); if (m->advertised > (ADV_ACTIVE * 3 / 4)) break; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); break; case MBR_REMITTED: if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } if (list_empty(&grp->pending)) return; /* Set oldest pending member to active and advertise */ pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_JOINING: case MBR_LEAVING: default: break; } } static void tipc_group_create_event(struct tipc_group *grp, struct tipc_member *m, u32 event, u16 seqno, struct sk_buff_head *inputq) { u32 dnode = tipc_own_addr(grp->net); struct tipc_event evt; struct sk_buff *skb; struct tipc_msg *hdr; memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; evt.port.ref = m->port; evt.port.node = m->node; evt.s.seq.type = grp->type; evt.s.seq.lower = m->instance; evt.s.seq.upper = m->instance; skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, GROUP_H_SIZE, sizeof(evt), dnode, m->node, grp->portid, m->port, 0); if (!skb) return; hdr = buf_msg(skb); msg_set_nametype(hdr, grp->type); msg_set_grp_evt(hdr, event); msg_set_dest_droppable(hdr, true); msg_set_grp_bc_seqno(hdr, seqno); memcpy(msg_data(hdr), &evt, sizeof(evt)); TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); } static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), m->port, grp->portid, 0); if (!skb) return; if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; else if (m->state == MBR_JOINED || m->state == MBR_PENDING) adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_LEAVE_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; if (!grp) return; if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) return; m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received if necessary */ if (m->state != MBR_PUBLISHED) return; /* Member can be taken into service */ m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) return; list_del_init(&m->small_win); *m->group->open = true; *usr_wakeup = true; tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; m->advertised = ADV_IDLE + in_flight; return; } /* This should never happen */ if (m->advertised < remitted) pr_warn_ratelimited("Unexpected REMIT msg\n"); /* All messages preceding the REMIT have been read */ m->state = MBR_JOINED; grp->active_cnt--; m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) return; pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } /* tipc_group_member_evt() - receive and handle a member up/down event */ void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; int event = evt->event; struct tipc_member *m; struct net *net; u32 self; if (!grp) return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) return; m = tipc_group_find_member(grp, node, port); switch (event) { case TIPC_PUBLISHED: /* Send and wait for arrival of JOIN message if necessary */ if (!m) { m = tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); if (!m) break; tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); break; } if (m->state != MBR_JOINING) break; /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); break; case TIPC_WITHDRAWN: if (!m) break; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); break; default: break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, grp->type) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, grp->instance) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, grp->bc_snd_nxt)) goto group_msg_cancel; if (grp->scope == TIPC_NODE_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) goto group_msg_cancel; if (grp->scope == TIPC_CLUSTER_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) goto group_msg_cancel; if (*grp->open) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) goto group_msg_cancel; nla_nest_end(skb, group); return 0; group_msg_cancel: nla_nest_cancel(skb, group); return -1; } |
| 63 1 28 26 24 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux NET3: Internet Group Management Protocol [IGMP] * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Extended to talk the BSD extended IGMP protocol of mrouted 3.6 */ #ifndef _LINUX_IGMP_H #define _LINUX_IGMP_H #include <linux/skbuff.h> #include <linux/timer.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/refcount.h> #include <linux/sockptr.h> #include <uapi/linux/igmp.h> static inline struct igmphdr *igmp_hdr(const struct sk_buff *skb) { return (struct igmphdr *)skb_transport_header(skb); } static inline struct igmpv3_report * igmpv3_report_hdr(const struct sk_buff *skb) { return (struct igmpv3_report *)skb_transport_header(skb); } static inline struct igmpv3_query * igmpv3_query_hdr(const struct sk_buff *skb) { return (struct igmpv3_query *)skb_transport_header(skb); } struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; __be32 sl_addr[] __counted_by(sl_max); }; #define IP_SFBLOCK 10 /* allocate this many at once */ /* ip_mc_socklist is real list now. Speed is not argument; this list never used in fast path code */ struct ip_mc_socklist { struct ip_mc_socklist __rcu *next_rcu; struct ip_mreqn multi; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip_sf_list { struct ip_sf_list *sf_next; unsigned long sf_count[2]; /* include/exclude counts */ __be32 sf_inaddr; unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ }; struct ip_mc_list { struct in_device *interface; __be32 multiaddr; unsigned int sfmode; struct ip_sf_list *sources; struct ip_sf_list *tomb; unsigned long sfcount[2]; union { struct ip_mc_list *next; struct ip_mc_list __rcu *next_rcu; }; struct ip_mc_list __rcu *next_hash; struct timer_list timer; int users; refcount_t refcnt; spinlock_t lock; char tm_running; char reporter; char unsolicit_count; char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* V3 exponential field decoding */ #define IGMPV3_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) #define IGMPV3_EXP(thresh, nbmant, nbexp, value) \ ((value) < (thresh) ? (value) : \ ((IGMPV3_MASK(value, nbmant) | (1<<(nbmant))) << \ (IGMPV3_MASK((value) >> (nbmant), nbexp) + (nbexp)))) #define IGMPV3_QQIC(value) IGMPV3_EXP(0x80, 4, 3, value) #define IGMPV3_MRC(value) IGMPV3_EXP(0x80, 4, 3, value) static inline int ip_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ip_transport_len(skb) < len) return 0; return pskb_may_pull(skb, len); } extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(const struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); extern void ip_mc_destroy_dev(struct in_device *); extern void ip_mc_up(struct in_device *); extern void ip_mc_down(struct in_device *); extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); static inline void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) { return __ip_mc_dec_group(in_dev, addr, GFP_KERNEL); } extern void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); int ip_mc_check_igmp(struct sk_buff *skb); #endif |
| 546 582 4 16 100 500 281 176 86 5242 2436 474 1853 640 5241 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. NET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Ethernet handlers. * * Version: @(#)eth.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <linux/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. */ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. */ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { u8 addr[ETH_ALEN]; eth_random_addr(addr); __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_set - Assign Ethernet address to a net_device * @dev: pointer to net_device structure * @addr: address to assign * * Assign given address to the net_device, addr_assign_type is not changed. */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { __dev_addr_set(dev, addr, ETH_ALEN); } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). */ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; eth_hw_addr_set(dst, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. */ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask); } static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) { u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask); } static inline bool ether_addr_is_ip_mcast(const u8 *addr) { return ether_addr_is_ipv4_mcast(addr) || ether_addr_is_ipv6_mcast(addr); } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address. * * @offset: Offset to add. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_add(u8 *addr, long offset) { u64 u = ether_addr_to_u64(addr); u += offset; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_hw_addr_gen - Generate and assign Ethernet address to a port * @dev: pointer to port's net_device structure * @base_addr: base Ethernet address * @id: offset to add to the base address * * Generate a MAC address using a base address and an offset and assign it * to a net_device. Commonly used by switch drivers which need to compute * addresses for all their ports. addr_assign_type is not changed. */ static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, unsigned int id) { u64 u = ether_addr_to_u64(base_addr); u8 addr[ETH_ALEN]; u += id; u64_to_ether_addr(u, addr); eth_hw_addr_set(dev, addr); } /** * eth_skb_pkt_type - Assign packet type if destination address does not match * @skb: Assigned a packet type if address does not match @dev address * @dev: Network device used to compare packet address against * * If the destination MAC address of the packet does not match the network * device address, assign an appropriate packet type. */ static inline void eth_skb_pkt_type(struct sk_buff *skb, const struct net_device *dev) { const struct ethhdr *eth = eth_hdr(skb); if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) { if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { skb->pkt_type = PACKET_OTHERHOST; } } } static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) { struct ethhdr *eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); return eth; } /** * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */ |
| 40 2 27 17 190 214 49 295 7 1 1 3 33 22 19 49 4 45 41 17 35 41 29 38 42 24 17 16 16 11 12 17 16 11 16 4 2 6 36 20 15 15 35 14 3 2 5 6 9 42 2 47 39 23 47 49 49 49 5 5 5 5 4 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 | // SPDX-License-Identifier: GPL-2.0 /* * This module exports the functions: * * 'int set_selection_user(struct tiocl_selection __user *, * struct tty_struct *)' * 'int set_selection_kernel(struct tiocl_selection *, struct tty_struct *)' * 'void clear_selection(void)' * 'int paste_selection(struct tty_struct *)' * 'int sel_loadlut(u32 __user *)' * * Now that /dev/vcs exists, most of this can disappear again. */ #include <linux/module.h> #include <linux/tty.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/consolemap.h> #include <linux/selection.h> #include <linux/tiocl.h> #include <linux/console.h> #include <linux/tty_flip.h> #include <linux/sched/signal.h> /* Don't take this from <ctype.h>: 011-015 on the screen aren't spaces */ #define is_space_on_vt(c) ((c) == ' ') /* FIXME: all this needs locking */ static struct vc_selection { struct mutex lock; struct vc_data *cons; /* must not be deallocated */ char *buffer; unsigned int buf_len; volatile int start; /* cleared by clear_selection */ int end; } vc_sel = { .lock = __MUTEX_INITIALIZER(vc_sel.lock), .start = -1, }; /* clear_selection, highlight and highlight_pointer can be called from interrupt (via scrollback/front) */ /* set reverse video on characters s-e of console with selection. */ static inline void highlight(const int s, const int e) { invert_screen(vc_sel.cons, s, e-s+2, true); } /* use complementary color to show the pointer */ static inline void highlight_pointer(const int where) { complement_pos(vc_sel.cons, where); } static u32 sel_pos(int n, bool unicode) { if (unicode) return screen_glyph_unicode(vc_sel.cons, n / 2); return inverse_translate(vc_sel.cons, screen_glyph(vc_sel.cons, n), false); } /** * clear_selection - remove current selection * * Remove the current selection highlight, if any from the console holding the * selection. * * Locking: The caller must hold the console lock. */ void clear_selection(void) { highlight_pointer(-1); /* hide the pointer */ if (vc_sel.start != -1) { highlight(vc_sel.start, vc_sel.end); vc_sel.start = -1; } } EXPORT_SYMBOL_GPL(clear_selection); bool vc_is_sel(const struct vc_data *vc) { return vc == vc_sel.cons; } /* * User settable table: what characters are to be considered alphabetic? * 128 bits. Locked by the console lock. */ static u32 inwordLut[]={ 0x00000000, /* control chars */ 0x03FFE000, /* digits and "-./" */ 0x87FFFFFE, /* uppercase and '_' */ 0x07FFFFFE, /* lowercase */ }; static inline int inword(const u32 c) { return c > 0x7f || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1); } /** * sel_loadlut() - load the LUT table * @lut: user table * * Load the LUT table from user space. Make a temporary copy so a partial * update doesn't make a mess. * * Locking: The console lock is acquired. */ int sel_loadlut(u32 __user *lut) { u32 tmplut[ARRAY_SIZE(inwordLut)]; if (copy_from_user(tmplut, lut, sizeof(inwordLut))) return -EFAULT; guard(console_lock)(); memcpy(inwordLut, tmplut, sizeof(inwordLut)); return 0; } /* does screen address p correspond to character at LH/RH edge of screen? */ static inline int atedge(const int p, int size_row) { return (!(p % size_row) || !((p + 2) % size_row)); } /* stores the char in UTF8 and returns the number of bytes used (1-4) */ static int store_utf8(u32 c, char *p) { if (c < 0x80) { /* 0******* */ p[0] = c; return 1; } else if (c < 0x800) { /* 110***** 10****** */ p[0] = 0xc0 | (c >> 6); p[1] = 0x80 | (c & 0x3f); return 2; } else if (c < 0x10000) { /* 1110**** 10****** 10****** */ p[0] = 0xe0 | (c >> 12); p[1] = 0x80 | ((c >> 6) & 0x3f); p[2] = 0x80 | (c & 0x3f); return 3; } else if (c < 0x110000) { /* 11110*** 10****** 10****** 10****** */ p[0] = 0xf0 | (c >> 18); p[1] = 0x80 | ((c >> 12) & 0x3f); p[2] = 0x80 | ((c >> 6) & 0x3f); p[3] = 0x80 | (c & 0x3f); return 4; } else { /* outside Unicode, replace with U+FFFD */ p[0] = 0xef; p[1] = 0xbf; p[2] = 0xbd; return 3; } } /** * set_selection_user - set the current selection. * @sel: user selection info * @tty: the console tty * * Invoked by the ioctl handle for the vt layer. * * Locking: The entire selection process is managed under the console_lock. * It's a lot under the lock but its hardly a performance path. */ int set_selection_user(const struct tiocl_selection __user *sel, struct tty_struct *tty) { struct tiocl_selection v; if (copy_from_user(&v, sel, sizeof(*sel))) return -EFAULT; /* * TIOCL_SELCLEAR and TIOCL_SELPOINTER are OK to use without * CAP_SYS_ADMIN as they do not modify the selection. */ switch (v.sel_mode) { case TIOCL_SELCLEAR: case TIOCL_SELPOINTER: break; default: if (!capable(CAP_SYS_ADMIN)) return -EPERM; } return set_selection_kernel(&v, tty); } static int vc_selection_store_chars(struct vc_data *vc, bool unicode) { char *bp, *obp; unsigned int i; /* Allocate a new buffer before freeing the old one ... */ /* chars can take up to 4 bytes with unicode */ bp = kmalloc_array((vc_sel.end - vc_sel.start) / 2 + 1, unicode ? 4 : 1, GFP_KERNEL | __GFP_NOWARN); if (!bp) { printk(KERN_WARNING "selection: kmalloc() failed\n"); clear_selection(); return -ENOMEM; } kfree(vc_sel.buffer); vc_sel.buffer = bp; obp = bp; for (i = vc_sel.start; i <= vc_sel.end; i += 2) { u32 c = sel_pos(i, unicode); if (unicode) bp += store_utf8(c, bp); else *bp++ = c; if (!is_space_on_vt(c)) obp = bp; if (!((i + 2) % vc->vc_size_row)) { /* strip trailing blanks from line and add newline, unless non-space at end of line. */ if (obp != bp) { bp = obp; *bp++ = '\r'; } obp = bp; } } vc_sel.buf_len = bp - vc_sel.buffer; return 0; } static int vc_do_selection(struct vc_data *vc, unsigned short mode, int ps, int pe) { int new_sel_start, new_sel_end, spc; bool unicode = vt_do_kdgkbmode(fg_console) == K_UNICODE; switch (mode) { case TIOCL_SELCHAR: /* character-by-character selection */ new_sel_start = ps; new_sel_end = pe; break; case TIOCL_SELWORD: /* word-by-word selection */ spc = is_space_on_vt(sel_pos(ps, unicode)); for (new_sel_start = ps; ; ps -= 2) { if ((spc && !is_space_on_vt(sel_pos(ps, unicode))) || (!spc && !inword(sel_pos(ps, unicode)))) break; new_sel_start = ps; if (!(ps % vc->vc_size_row)) break; } spc = is_space_on_vt(sel_pos(pe, unicode)); for (new_sel_end = pe; ; pe += 2) { if ((spc && !is_space_on_vt(sel_pos(pe, unicode))) || (!spc && !inword(sel_pos(pe, unicode)))) break; new_sel_end = pe; if (!((pe + 2) % vc->vc_size_row)) break; } break; case TIOCL_SELLINE: /* line-by-line selection */ new_sel_start = rounddown(ps, vc->vc_size_row); new_sel_end = rounddown(pe, vc->vc_size_row) + vc->vc_size_row - 2; break; case TIOCL_SELPOINTER: highlight_pointer(pe); return 0; default: return -EINVAL; } /* remove the pointer */ highlight_pointer(-1); /* select to end of line if on trailing space */ if (new_sel_end > new_sel_start && !atedge(new_sel_end, vc->vc_size_row) && is_space_on_vt(sel_pos(new_sel_end, unicode))) { for (pe = new_sel_end + 2; ; pe += 2) if (!is_space_on_vt(sel_pos(pe, unicode)) || atedge(pe, vc->vc_size_row)) break; if (is_space_on_vt(sel_pos(pe, unicode))) new_sel_end = pe; } if (vc_sel.start == -1) /* no current selection */ highlight(new_sel_start, new_sel_end); else if (new_sel_start == vc_sel.start) { if (new_sel_end == vc_sel.end) /* no action required */ return 0; else if (new_sel_end > vc_sel.end) /* extend to right */ highlight(vc_sel.end + 2, new_sel_end); else /* contract from right */ highlight(new_sel_end + 2, vc_sel.end); } else if (new_sel_end == vc_sel.end) { if (new_sel_start < vc_sel.start) /* extend to left */ highlight(new_sel_start, vc_sel.start - 2); else /* contract from left */ highlight(vc_sel.start, new_sel_start - 2); } else /* some other case; start selection from scratch */ { clear_selection(); highlight(new_sel_start, new_sel_end); } vc_sel.start = new_sel_start; vc_sel.end = new_sel_end; return vc_selection_store_chars(vc, unicode); } static int vc_selection(struct vc_data *vc, struct tiocl_selection *v, struct tty_struct *tty) { int ps, pe; poke_blanked_console(); if (v->sel_mode == TIOCL_SELCLEAR) { /* useful for screendump without selection highlights */ clear_selection(); return 0; } v->xs = min_t(u16, v->xs - 1, vc->vc_cols - 1); v->ys = min_t(u16, v->ys - 1, vc->vc_rows - 1); v->xe = min_t(u16, v->xe - 1, vc->vc_cols - 1); v->ye = min_t(u16, v->ye - 1, vc->vc_rows - 1); if (mouse_reporting() && (v->sel_mode & TIOCL_SELMOUSEREPORT)) { mouse_report(tty, v->sel_mode & TIOCL_SELBUTTONMASK, v->xs, v->ys); return 0; } ps = v->ys * vc->vc_size_row + (v->xs << 1); pe = v->ye * vc->vc_size_row + (v->xe << 1); if (ps > pe) /* make vc_sel.start <= vc_sel.end */ swap(ps, pe); if (vc_sel.cons != vc) { clear_selection(); vc_sel.cons = vc; } return vc_do_selection(vc, v->sel_mode, ps, pe); } int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) { guard(mutex)(&vc_sel.lock); guard(console_lock)(); return vc_selection(vc_cons[fg_console].d, v, tty); } EXPORT_SYMBOL_GPL(set_selection_kernel); /* Insert the contents of the selection buffer into the * queue of the tty associated with the current console. * Invoked by ioctl(). * * Locking: called without locks. Calls the ldisc wrongly with * unsafe methods, */ int paste_selection(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; int pasted = 0; size_t count; struct tty_ldisc *ld; DECLARE_WAITQUEUE(wait, current); int ret = 0; bool bp = vc->vc_bracketed_paste; static const char bracketed_paste_start[] = "\033[200~"; static const char bracketed_paste_end[] = "\033[201~"; const char *bps = bp ? bracketed_paste_start : NULL; const char *bpe = bp ? bracketed_paste_end : NULL; scoped_guard(console_lock) poke_blanked_console(); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; /* ldisc was hung up */ tty_buffer_lock_exclusive(&vc->port); add_wait_queue(&vc->paste_wait, &wait); mutex_lock(&vc_sel.lock); while (vc_sel.buffer && (vc_sel.buf_len > pasted || bpe)) { set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) { ret = -EINTR; break; } if (tty_throttled(tty)) { mutex_unlock(&vc_sel.lock); schedule(); mutex_lock(&vc_sel.lock); continue; } __set_current_state(TASK_RUNNING); if (bps) { bps += tty_ldisc_receive_buf(ld, bps, NULL, strlen(bps)); if (*bps != '\0') continue; bps = NULL; } count = vc_sel.buf_len - pasted; if (count) { pasted += tty_ldisc_receive_buf(ld, vc_sel.buffer + pasted, NULL, count); if (vc_sel.buf_len > pasted) continue; } if (bpe) { bpe += tty_ldisc_receive_buf(ld, bpe, NULL, strlen(bpe)); if (*bpe == '\0') bpe = NULL; } } mutex_unlock(&vc_sel.lock); remove_wait_queue(&vc->paste_wait, &wait); __set_current_state(TASK_RUNNING); tty_buffer_unlock_exclusive(&vc->port); tty_ldisc_deref(ld); return ret; } EXPORT_SYMBOL_GPL(paste_selection); |
| 110 167 167 167 167 3900 3903 110 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); wakeup_attr(relax_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_relax_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init); |
| 135 135 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | // SPDX-License-Identifier: GPL-2.0-or-later /* mpi-sub-ui.c - Subtract an unsigned integer from an MPI. * * Copyright 1991, 1993, 1994, 1996, 1999-2002, 2004, 2012, 2013, 2015 * Free Software Foundation, Inc. * * This file was based on the GNU MP Library source file: * https://gmplib.org/repo/gmp-6.2/file/510b83519d1c/mpz/aors_ui.h * * The GNU MP Library is free software; you can redistribute it and/or modify * it under the terms of either: * * * the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your * option) any later version. * * or * * * the GNU General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) any * later version. * * or both in parallel, as here. * * The GNU MP Library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received copies of the GNU General Public License and the * GNU Lesser General Public License along with the GNU MP Library. If not, * see https://www.gnu.org/licenses/. */ #include <linux/export.h> #include "mpi-internal.h" int mpi_sub_ui(MPI w, MPI u, unsigned long vval) { if (u->nlimbs == 0) { if (mpi_resize(w, 1) < 0) return -ENOMEM; w->d[0] = vval; w->nlimbs = (vval != 0); w->sign = (vval != 0); return 0; } /* If not space for W (and possible carry), increase space. */ if (mpi_resize(w, u->nlimbs + 1)) return -ENOMEM; if (u->sign) { mpi_limb_t cy; cy = mpihelp_add_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval); w->d[u->nlimbs] = cy; w->nlimbs = u->nlimbs + cy; w->sign = 1; } else { /* The signs are different. Need exact comparison to determine * which operand to subtract from which. */ if (u->nlimbs == 1 && u->d[0] < vval) { w->d[0] = vval - u->d[0]; w->nlimbs = 1; w->sign = 1; } else { mpihelp_sub_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval); /* Size can decrease with at most one limb. */ w->nlimbs = (u->nlimbs - (w->d[u->nlimbs - 1] == 0)); w->sign = 0; } } mpi_normalize(w); return 0; } EXPORT_SYMBOL_GPL(mpi_sub_ui); |
| 28 2 26 3 9 1 4 4 31 1 9 21 12 2 24 1 7 17 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <linux/xattr.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "xattr.h" struct io_xattr { struct file *file; struct kernel_xattr_ctx ctx; struct filename *filename; }; void io_xattr_cleanup(struct io_kiocb *req) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); if (ix->filename) putname(ix->filename); kfree(ix->ctx.kname); kvfree(ix->ctx.kvalue); } static void io_xattr_finish(struct io_kiocb *req, int ret) { req->flags &= ~REQ_F_NEED_CLEANUP; io_xattr_cleanup(req); io_req_set_res(req, ret, 0); } static int __io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); const char __user *name; int ret; ix->filename = NULL; ix->ctx.kvalue = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ix->ctx.size = READ_ONCE(sqe->len); ix->ctx.flags = READ_ONCE(sqe->xattr_flags); if (ix->ctx.flags) return -EINVAL; ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); if (!ix->ctx.kname) return -ENOMEM; ret = import_xattr_name(ix->ctx.kname, name); if (ret) { kfree(ix->ctx.kname); return ret; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_getxattr_prep(req, sqe); } int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); const char __user *path; int ret; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; ret = __io_getxattr_prep(req, sqe); if (ret) return ret; path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); ix->filename = getname(path); if (IS_ERR(ix->filename)) return PTR_ERR(ix->filename); return 0; } int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); return IOU_COMPLETE; } int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); return IOU_COMPLETE; } static int __io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); const char __user *name; int ret; ix->filename = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ix->ctx.kvalue = NULL; ix->ctx.size = READ_ONCE(sqe->len); ix->ctx.flags = READ_ONCE(sqe->xattr_flags); ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); if (!ix->ctx.kname) return -ENOMEM; ret = setxattr_copy(name, &ix->ctx); if (ret) { kfree(ix->ctx.kname); return ret; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); const char __user *path; int ret; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; ret = __io_setxattr_prep(req, sqe); if (ret) return ret; path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); ix->filename = getname(path); if (IS_ERR(ix->filename)) return PTR_ERR(ix->filename); return 0; } int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { return __io_setxattr_prep(req, sqe); } int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); return IOU_COMPLETE; } int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); return IOU_COMPLETE; } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_blackhole.c Black hole queue * * Authors: Thomas Graf <tgraf@suug.ch> * * Note: Quantum tunneling is not supported. */ #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { qdisc_drop(skb, sch, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) { return NULL; } static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = { .id = "blackhole", .priv_size = 0, .enqueue = blackhole_enqueue, .dequeue = blackhole_dequeue, .peek = blackhole_dequeue, .owner = THIS_MODULE, }; static int __init blackhole_init(void) { return register_qdisc(&blackhole_qdisc_ops); } device_initcall(blackhole_init) |
| 4291 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * cls_cgroup.h Control Group Classifier * * Authors: Thomas Graf <tgraf@suug.ch> */ #ifndef _NET_CLS_CGROUP_H #define _NET_CLS_CGROUP_H #include <linux/cgroup.h> #include <linux/hardirq.h> #include <linux/rcupdate.h> #include <net/sock.h> #include <net/inet_sock.h> #ifdef CONFIG_CGROUP_NET_CLASSID struct cgroup_cls_state { struct cgroup_subsys_state css; u32 classid; }; struct cgroup_cls_state *task_cls_state(struct task_struct *p); static inline u32 task_cls_classid(struct task_struct *p) { u32 classid; if (in_interrupt()) return 0; rcu_read_lock(); classid = container_of(task_css(p, net_cls_cgrp_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); return classid; } static inline void sock_update_classid(struct sock_cgroup_data *skcd) { u32 classid; classid = task_cls_classid(current); sock_cgroup_set_classid(skcd, classid); } static inline u32 __task_get_classid(struct task_struct *task) { return task_cls_state(task)->classid; } static inline u32 task_get_classid(const struct sk_buff *skb) { u32 classid = __task_get_classid(current); /* Due to the nature of the classifier it is required to ignore all * packets originating from softirq context as accessing `current' * would lead to false results. * * This test assumes that all callers of dev_queue_xmit() explicitly * disable bh. Knowing this, it is possible to detect softirq based * calls by looking at the number of nested bh disable calls because * softirqs always disables bh. */ if (softirq_count()) { struct sock *sk = skb_to_full_sk(skb); /* If there is an sock_cgroup_classid we'll use that. */ if (!sk || !sk_fullsock(sk)) return 0; classid = sock_cgroup_classid(&sk->sk_cgrp_data); } return classid; } #else /* !CONFIG_CGROUP_NET_CLASSID */ static inline void sock_update_classid(struct sock_cgroup_data *skcd) { } static inline u32 task_get_classid(const struct sk_buff *skb) { return 0; } #endif /* CONFIG_CGROUP_NET_CLASSID */ #endif /* _NET_CLS_CGROUP_H */ |
| 198 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_FRAG_CACHE_H #define _LINUX_PAGE_FRAG_CACHE_H #include <linux/bits.h> #include <linux/log2.h> #include <linux/mm_types_task.h> #include <linux/types.h> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) /* Use a full byte here to enable assembler optimization as the shift * operation is usually expecting a byte. */ #define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) #else /* Compiler should be able to figure out we don't read things as any value * ANDed with 0 is 0. */ #define PAGE_FRAG_CACHE_ORDER_MASK 0 #endif #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT (PAGE_FRAG_CACHE_ORDER_MASK + 1) static inline bool encoded_page_decode_pfmemalloc(unsigned long encoded_page) { return !!(encoded_page & PAGE_FRAG_CACHE_PFMEMALLOC_BIT); } static inline void page_frag_cache_init(struct page_frag_cache *nc) { nc->encoded_page = 0; } static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) { return encoded_page_decode_pfmemalloc(nc->encoded_page); } void page_frag_cache_drain(struct page_frag_cache *nc); void __page_frag_cache_drain(struct page *page, unsigned int count); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask); static inline void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); } static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); } void page_frag_free(void *addr); #endif |
| 3 571 570 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 | // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> #include <linux/netdevice.h> #include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks"); module_param(enable_hooks, bool, 0000); unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) { switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); break; case NFPROTO_IPV6: seq_printf(s, "src=%pI6 dst=%pI6 ", tuple->src.u3.ip6, tuple->dst.u3.ip6); break; default: break; } switch (l4proto->l4proto) { case IPPROTO_ICMP: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_TCP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.tcp.port), ntohs(tuple->dst.u.tcp.port)); break; case IPPROTO_UDPLITE: case IPPROTO_UDP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.udp.port), ntohs(tuple->dst.u.udp.port)); break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), ntohs(tuple->dst.u.sctp.port)); break; case IPPROTO_ICMPV6: seq_printf(s, "type=%u code=%u id=%u ", tuple->dst.u.icmp.type, tuple->dst.u.icmp.code, ntohs(tuple->src.u.icmp.id)); break; case IPPROTO_GRE: seq_printf(s, "srckey=0x%x dstkey=0x%x ", ntohs(tuple->src.u.gre.key), ntohs(tuple->dst.u.gre.key)); break; default: break; } } EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, struct ct_iter_state *st) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int i; for (i = st->bucket; i < st->htable_size; i++) { unsigned int skip = 0; restart: hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); struct hlist_nulls_node *tmp = n; if (!net_eq(net, nf_ct_net(ct))) continue; if (++skip <= st->skip_elems) continue; /* h should be returned, skip to nulls marker. */ while (!is_a_nulls(tmp)) tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); /* check if h is still linked to hash[i] */ if (get_nulls_value(tmp) != i) { skip = 0; goto restart; } st->skip_elems = skip; st->bucket = i; return h; } skip = 0; if (get_nulls_value(n) != i) goto restart; st->skip_elems = 0; } st->bucket = i; return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); if (*pos == 0) { st->skip_elems = 0; st->bucket = 0; } else if (st->skip_elems) { /* resume from last dumped entry */ st->skip_elems--; } return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct ct_iter_state *st = s->private; struct net *net = seq_file_net(s); (*pos)++; return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) __releases(RCU) { rcu_read_unlock(); } #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { struct lsm_context ctx; int ret; ret = security_secid_to_secctx(ct->secmark, &ctx); if (ret < 0) return; seq_printf(s, "secctx=%s ", ctx.context); security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { } #endif #ifdef CONFIG_NF_CONNTRACK_ZONES static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); if (zone->dir != dir) return; switch (zone->dir) { case NF_CT_DEFAULT_ZONE_DIR: seq_printf(s, "zone=%u ", zone->id); break; case NF_CT_ZONE_DIR_ORIG: seq_printf(s, "zone-orig=%u ", zone->id); break; case NF_CT_ZONE_DIR_REPL: seq_printf(s, "zone-reply=%u ", zone->id); break; default: break; } } #else static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, int dir) { } #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { struct ct_iter_state *st = s->private; struct nf_conn_tstamp *tstamp; s64 delta_time; tstamp = nf_conn_tstamp_find(ct); if (tstamp) { delta_time = st->time_now - tstamp->start; if (delta_time > 0) delta_time = div_s64(delta_time, NSEC_PER_SEC); else delta_time = 0; seq_printf(s, "delta-time=%llu ", (unsigned long long)delta_time); } return; } #else static inline void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { } #endif static const char* l3proto_name(u16 proto) { switch (proto) { case AF_INET: return "ipv4"; case AF_INET6: return "ipv6"; } return "unknown"; } static const char* l4proto_name(u16 proto) { switch (proto) { case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; case IPPROTO_ICMPV6: return "icmpv6"; } return "unknown"; } static void seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) { struct nf_conn_acct *acct; struct nf_conn_counter *counter; acct = nf_conn_acct_find(ct); if (!acct) return; counter = acct->counter; seq_printf(s, "packets=%llu bytes=%llu ", (unsigned long long)atomic64_read(&counter[dir].packets), (unsigned long long)atomic64_read(&counter[dir].bytes)); } /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; WARN_ON(!ct); if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; /* load ->status after refcount increase */ smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { struct ct_iter_state *st = s->private; st->skip_elems--; nf_ct_kill(ct); goto release; } /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) goto release; if (!net_eq(nf_ct_net(ct), net)) goto release; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); if (seq_has_overflowed(s)) goto release; seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL); if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); seq_print_acct(s, ct, IP_CT_DIR_REPLY); if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[HW_OFFLOAD] "); else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) goto release; #if defined(CONFIG_NF_CONNTRACK_MARK) seq_printf(s, "mark=%u ", READ_ONCE(ct->mark)); #endif ct_show_secctx(s, ct); ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; ret = 0; release: nf_ct_put(ct); return ret; } static const struct seq_operations ct_seq_ops = { .start = ct_seq_start, .next = ct_seq_next, .stop = ct_seq_stop, .show = ct_seq_show }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } return NULL; } static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; return per_cpu_ptr(net->ct.stat, cpu); } (*pos)++; return NULL; } static void ct_cpu_seq_stop(struct seq_file *seq, void *v) { } static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); const struct ip_conntrack_stat *st = v; unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } nr_conntracks = nf_conntrack_count(net); seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, st->clash_resolve, st->found, 0, st->invalid, 0, 0, st->chaintoolong, st->insert, st->insert_failed, st->drop, st->early_drop, st->error, st->expect_new, st->expect_create, st->expect_delete, st->search_restart ); return 0; } static const struct seq_operations ct_cpu_seq_ops = { .start = ct_cpu_seq_start, .next = ct_cpu_seq_next, .stop = ct_cpu_seq_stop, .show = ct_cpu_seq_show, }; static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; kuid_t root_uid; kgid_t root_gid; pde = proc_create_net("nf_conntrack", 0440, net->proc_net, &ct_seq_ops, sizeof(struct ct_iter_state)); if (!pde) goto out_nf_conntrack; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(pde, root_uid, root_gid); pde = proc_create_net("nf_conntrack", 0444, net->proc_net_stat, &ct_cpu_seq_ops, sizeof(struct seq_net_private)); if (!pde) goto out_stat_nf_conntrack; return 0; out_stat_nf_conntrack: remove_proc_entry("nf_conntrack", net->proc_net); out_nf_conntrack: return -ENOMEM; } static void nf_conntrack_standalone_fini_proc(struct net *net) { remove_proc_entry("nf_conntrack", net->proc_net_stat); remove_proc_entry("nf_conntrack", net->proc_net); } #else static int nf_conntrack_standalone_init_proc(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_proc(struct net *net) { } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ u32 nf_conntrack_count(const struct net *net) { const struct nf_conntrack_net *cnet = nf_ct_pernet(net); return atomic_read(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_count); /* Sysctl support */ #ifdef CONFIG_SYSCTL /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* module_param hashsize could have changed value */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; /* update ret, we might not be able to satisfy request */ ret = nf_conntrack_hash_resize(nf_conntrack_htable_size_user); /* update it to the actual value used by conntrack */ nf_conntrack_htable_size_user = nf_conntrack_htable_size; return ret; } static int nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, i; ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; if (*(u8 *)table->data == 0) return 0; /* Load nf_log_syslog only if no logger is currently registered */ for (i = 0; i < NFPROTO_NUMPROTO; i++) { if (nf_log_is_registered(i)) return 0; } request_module("%s", "nf_log_syslog"); return 0; } static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { NF_SYSCTL_CT_MAX, NF_SYSCTL_CT_COUNT, NF_SYSCTL_CT_BUCKETS, NF_SYSCTL_CT_CHECKSUM, NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP NF_SYSCTL_CT_TIMESTAMP, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST, NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP, NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, #ifdef CONFIG_NF_CT_PROTO_SCTP NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif NF_SYSCTL_CT_LAST_SYSCTL, }; static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, [NF_SYSCTL_CT_BUCKETS] = { .procname = "nf_conntrack_buckets", .data = &nf_conntrack_htable_size_user, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = nf_conntrack_hash_sysctl, }, [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = nf_conntrack_log_invalid_sysctl, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { .procname = "nf_conntrack_generic_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT] = { .procname = "nf_conntrack_tcp_timeout_syn_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV] = { .procname = "nf_conntrack_tcp_timeout_syn_recv", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED] = { .procname = "nf_conntrack_tcp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT] = { .procname = "nf_conntrack_tcp_timeout_fin_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT] = { .procname = "nf_conntrack_tcp_timeout_close_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK] = { .procname = "nf_conntrack_tcp_timeout_last_ack", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT] = { .procname = "nf_conntrack_tcp_timeout_time_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE] = { .procname = "nf_conntrack_tcp_timeout_close", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS] = { .procname = "nf_conntrack_tcp_timeout_max_retrans", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK] = { .procname = "nf_conntrack_tcp_timeout_unacknowledged", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = { .procname = "nf_flowtable_tcp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = { .procname = "nf_conntrack_tcp_ignore_invalid_rst", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM] = { .procname = "nf_conntrack_udp_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = { .procname = "nf_flowtable_udp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6] = { .procname = "nf_conntrack_icmpv6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_NF_CT_PROTO_SCTP [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = { .procname = "nf_conntrack_sctp_timeout_closed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = { .procname = "nf_conntrack_sctp_timeout_cookie_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = { .procname = "nf_conntrack_sctp_timeout_cookie_echoed", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED] = { .procname = "nf_conntrack_sctp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .procname = "nf_conntrack_sctp_timeout_shutdown_recd", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM] = { .procname = "nf_conntrack_gre_timeout_stream", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, #endif }; static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, struct ctl_table *table) { struct nf_tcp_net *tn = nf_tcp_pernet(net); #define XASSIGN(XNAME, tn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ ## XNAME].data = \ &(tn)->timeouts[TCP_CONNTRACK_ ## XNAME] XASSIGN(SYN_SENT, tn); XASSIGN(SYN_RECV, tn); XASSIGN(ESTABLISHED, tn); XASSIGN(FIN_WAIT, tn); XASSIGN(CLOSE_WAIT, tn); XASSIGN(LAST_ACK, tn); XASSIGN(TIME_WAIT, tn); XASSIGN(CLOSE, tn); XASSIGN(RETRANS, tn); XASSIGN(UNACK, tn); #undef XASSIGN #define XASSIGN(XNAME, rval) \ table[NF_SYSCTL_CT_PROTO_TCP_ ## XNAME].data = (rval) XASSIGN(LOOSE, &tn->tcp_loose); XASSIGN(LIBERAL, &tn->tcp_be_liberal); XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans); XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst); #undef XASSIGN #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; #endif } static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net *sn = nf_sctp_pernet(net); #define XASSIGN(XNAME, sn) \ table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \ &(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME] XASSIGN(CLOSED, sn); XASSIGN(COOKIE_WAIT, sn); XASSIGN(COOKIE_ECHOED, sn); XASSIGN(ESTABLISHED, sn); XASSIGN(SHUTDOWN_SENT, sn); XASSIGN(SHUTDOWN_RECD, sn); XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); #undef XASSIGN #endif } static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { #ifdef CONFIG_NF_CT_PROTO_GRE struct nf_gre_net *gn = nf_gre_pernet(net); table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE].data = &gn->timeouts[GRE_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM].data = &gn->timeouts[GRE_CT_REPLIED]; #endif } static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL); table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), GFP_KERNEL); if (!table) return -ENOMEM; table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp; #endif table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED]; table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; #endif nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_ct_sysctl_table)); if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; out_unregister_netfilter: kfree(table); return -ENOMEM; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else static int nf_conntrack_standalone_init_sysctl(struct net *net) { return 0; } static void nf_conntrack_standalone_fini_sysctl(struct net *net) { } #endif /* CONFIG_SYSCTL */ static void nf_conntrack_fini_net(struct net *net) { if (enable_hooks) nf_ct_netns_put(net, NFPROTO_INET); nf_conntrack_standalone_fini_proc(net); nf_conntrack_standalone_fini_sysctl(net); } static int nf_conntrack_pernet_init(struct net *net) { int ret; net->ct.sysctl_checksum = 1; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) return ret; ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; ret = nf_conntrack_init_net(net); if (ret < 0) goto out_init_net; if (enable_hooks) { ret = nf_ct_netns_get(net, NFPROTO_INET); if (ret < 0) goto out_hooks; } return 0; out_hooks: nf_conntrack_cleanup_net(net); out_init_net: nf_conntrack_standalone_fini_proc(net); out_proc: nf_conntrack_standalone_fini_sysctl(net); return ret; } static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) nf_conntrack_fini_net(net); nf_conntrack_cleanup_net_list(net_exit_list); } static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, .id = &nf_conntrack_net_id, .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) { int ret = nf_conntrack_init_start(); if (ret < 0) goto out_start; BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER); #ifdef CONFIG_SYSCTL nf_ct_netfilter_header = register_net_sysctl(&init_net, "net", nf_ct_netfilter_table); if (!nf_ct_netfilter_header) { pr_err("nf_conntrack: can't register to sysctl.\n"); ret = -ENOMEM; goto out_sysctl; } nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif nf_conntrack_init_end(); ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; return 0; out_pernet: #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); out_sysctl: #endif nf_conntrack_cleanup_end(); out_start: return ret; } static void __exit nf_conntrack_standalone_fini(void) { nf_conntrack_cleanup_start(); unregister_pernet_subsys(&nf_conntrack_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(nf_ct_netfilter_header); #endif nf_conntrack_cleanup_end(); } module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini); |
| 609 313 293 70 36 36 595 205 185 28 508 540 538 539 559 4 530 77 21 110 340 539 542 438 37 105 407 2 51 540 1 519 210 540 540 1 540 521 521 8 520 522 110 109 110 110 110 110 110 110 2908 1923 1165 16 2 565 568 565 563 566 385 6 706 3 3 1 2 381 52 1 50 1 522 398 275 255 514 1 139 253 136 235 20 112 98 204 1 1 202 513 52 2 50 521 4 2 2 1 2 9 1 8 9 1 111 108 108 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 | // SPDX-License-Identifier: GPL-2.0-or-later /* Basic authentication token and access key management * * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/poison.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/workqueue.h> #include <linux/random.h> #include <linux/err.h> #include "internal.h" struct kmem_cache *key_jar; struct rb_root key_serial_tree; /* tree of keys indexed by serial */ DEFINE_SPINLOCK(key_serial_lock); struct rb_root key_user_tree; /* tree of quota records indexed by UID */ DEFINE_SPINLOCK(key_user_lock); unsigned int key_quota_root_maxkeys = 1000000; /* root's key count quota */ unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */ unsigned int key_quota_maxkeys = 200; /* general key count quota */ unsigned int key_quota_maxbytes = 20000; /* general key space quota */ static LIST_HEAD(key_types_list); static DECLARE_RWSEM(key_types_sem); /* We serialise key instantiation and link */ DEFINE_MUTEX(key_construction_mutex); #ifdef KEY_DEBUGGING void __key_check(const struct key *key) { printk("__key_check: key %p {%08x} should be {%08x}\n", key, key->magic, KEY_DEBUG_MAGIC); BUG(); } #endif /* * Get the key quota record for a user, allocating a new record if one doesn't * already exist. */ struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; struct rb_node *parent, **p; try_again: parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); /* search the tree for a user record with a matching UID */ while (*p) { parent = *p; user = rb_entry(parent, struct key_user, node); if (uid_lt(uid, user->uid)) p = &(*p)->rb_left; else if (uid_gt(uid, user->uid)) p = &(*p)->rb_right; else goto found; } /* if we get here, we failed to find a match in the tree */ if (!candidate) { /* allocate a candidate user record if we don't already have * one */ spin_unlock(&key_user_lock); user = NULL; candidate = kmalloc(sizeof(struct key_user), GFP_KERNEL); if (unlikely(!candidate)) goto out; /* the allocation may have scheduled, so we need to repeat the * search lest someone else added the record whilst we were * asleep */ goto try_again; } /* if we get here, then the user record still hadn't appeared on the * second pass - so we use the candidate record */ refcount_set(&candidate->usage, 1); atomic_set(&candidate->nkeys, 0); atomic_set(&candidate->nikeys, 0); candidate->uid = uid; candidate->qnkeys = 0; candidate->qnbytes = 0; spin_lock_init(&candidate->lock); mutex_init(&candidate->cons_lock); rb_link_node(&candidate->node, parent, p); rb_insert_color(&candidate->node, &key_user_tree); spin_unlock(&key_user_lock); user = candidate; goto out; /* okay - we found a user record for this UID */ found: refcount_inc(&user->usage); spin_unlock(&key_user_lock); kfree(candidate); out: return user; } /* * Dispose of a user structure */ void key_user_put(struct key_user *user) { if (refcount_dec_and_lock(&user->usage, &key_user_lock)) { rb_erase(&user->node, &key_user_tree); spin_unlock(&key_user_lock); kfree(user); } } /* * Allocate a serial number for a key. These are assigned randomly to avoid * security issues through covert channel problems. */ static inline void key_alloc_serial(struct key *key) { struct rb_node *parent, **p; struct key *xkey; /* propose a random serial number and look for a hole for it in the * serial number tree */ do { get_random_bytes(&key->serial, sizeof(key->serial)); key->serial >>= 1; /* negative numbers are not permitted */ } while (key->serial < 3); spin_lock(&key_serial_lock); attempt_insertion: parent = NULL; p = &key_serial_tree.rb_node; while (*p) { parent = *p; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) p = &(*p)->rb_left; else if (key->serial > xkey->serial) p = &(*p)->rb_right; else goto serial_exists; } /* we've found a suitable hole - arrange for this key to occupy it */ rb_link_node(&key->serial_node, parent, p); rb_insert_color(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); return; /* we found a key with the proposed serial number - walk the tree from * that point looking for the next unused serial number */ serial_exists: for (;;) { key->serial++; if (key->serial < 3) { key->serial = 3; goto attempt_insertion; } parent = rb_next(parent); if (!parent) goto attempt_insertion; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) goto attempt_insertion; } } /** * key_alloc - Allocate a key of the specified type. * @type: The type of key to allocate. * @desc: The key description to allow the key to be searched out. * @uid: The owner of the new key. * @gid: The group ID for the new key's group permissions. * @cred: The credentials specifying UID namespace. * @perm: The permissions mask of the new key. * @flags: Flags specifying quota properties. * @restrict_link: Optional link restriction for new keyrings. * * Allocate a key of the specified type with the attributes given. The key is * returned in an uninstantiated state and the caller needs to instantiate the * key before returning. * * The restrict_link structure (if not NULL) will be freed when the * keyring is destroyed, so it must be dynamically allocated. * * The user's key count quota is updated to reflect the creation of the key and * the user's key data quota has the default for the key type reserved. The * instantiation function should amend this as necessary. If insufficient * quota is available, -EDQUOT will be returned. * * The LSM security modules can prevent a key being created, in which case * -EACCES will be returned. * * Returns a pointer to the new key if successful and an error code otherwise. * * Note that the caller needs to ensure the key type isn't uninstantiated. * Internally this can be done by locking key_types_sem. Externally, this can * be done by either never unregistering the key type, or making sure * key_alloc() calls don't race with module unloading. */ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link) { struct key_user *user = NULL; struct key *key; size_t desclen, quotalen; int ret; unsigned long irqflags; key = ERR_PTR(-EINVAL); if (!desc || !*desc) goto error; if (type->vet_description) { ret = type->vet_description(desc); if (ret < 0) { key = ERR_PTR(ret); goto error; } } desclen = strlen(desc); quotalen = desclen + 1 + type->def_datalen; /* get hold of the key tracking for this user */ user = key_user_lookup(uid); if (!user) goto no_memory_1; /* check that the user's quota permits allocation of another key and * its description */ if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; spin_lock_irqsave(&user->lock, irqflags); if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) { if (user->qnkeys + 1 > maxkeys || user->qnbytes + quotalen > maxbytes || user->qnbytes + quotalen < user->qnbytes) goto no_quota; } user->qnkeys++; user->qnbytes += quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } /* allocate and initialise the key and its description */ key = kmem_cache_zalloc(key_jar, GFP_KERNEL); if (!key) goto no_memory_2; key->index_key.desc_len = desclen; key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL); if (!key->index_key.description) goto no_memory_3; key->index_key.type = type; key_set_index_key(&key->index_key); refcount_set(&key->usage, 1); init_rwsem(&key->sem); lockdep_set_class(&key->sem, &type->lock_class); key->user = user; key->quotalen = quotalen; key->datalen = type->def_datalen; key->uid = uid; key->gid = gid; key->perm = perm; key->expiry = TIME64_MAX; key->restrict_link = restrict_link; key->last_used_at = ktime_get_real_seconds(); key->flags |= 1 << KEY_FLAG_USER_ALIVE; if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; if (flags & KEY_ALLOC_UID_KEYRING) key->flags |= 1 << KEY_FLAG_UID_KEYRING; if (flags & KEY_ALLOC_SET_KEEP) key->flags |= 1 << KEY_FLAG_KEEP; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; #endif /* let the security module know about the key */ ret = security_key_alloc(key, cred, flags); if (ret < 0) goto security_error; /* publish the key by giving it a serial number */ refcount_inc(&key->domain_tag->usage); atomic_inc(&user->nkeys); key_alloc_serial(key); error: return key; security_error: kfree(key->description); kmem_cache_free(key_jar, key); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock_irqsave(&user->lock, irqflags); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } key_user_put(user); key = ERR_PTR(ret); goto error; no_memory_3: kmem_cache_free(key_jar, key); no_memory_2: if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock_irqsave(&user->lock, irqflags); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock_irqrestore(&user->lock, irqflags); } key_user_put(user); no_memory_1: key = ERR_PTR(-ENOMEM); goto error; no_quota: spin_unlock_irqrestore(&user->lock, irqflags); key_user_put(user); key = ERR_PTR(-EDQUOT); goto error; } EXPORT_SYMBOL(key_alloc); /** * key_payload_reserve - Adjust data quota reservation for the key's payload * @key: The key to make the reservation for. * @datalen: The amount of data payload the caller now wants. * * Adjust the amount of the owning user's key data quota that a key reserves. * If the amount is increased, then -EDQUOT may be returned if there isn't * enough free quota available. * * If successful, 0 is returned. */ int key_payload_reserve(struct key *key, size_t datalen) { int delta = (int)datalen - key->datalen; int ret = 0; key_check(key); /* contemplate the quota adjustment */ if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { unsigned maxbytes = uid_eq(key->user->uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; unsigned long flags; spin_lock_irqsave(&key->user->lock, flags); if (delta > 0 && (key->user->qnbytes + delta > maxbytes || key->user->qnbytes + delta < key->user->qnbytes)) { ret = -EDQUOT; } else { key->user->qnbytes += delta; key->quotalen += delta; } spin_unlock_irqrestore(&key->user->lock, flags); } /* change the recorded data length if that didn't generate an error */ if (ret == 0) key->datalen = datalen; return ret; } EXPORT_SYMBOL(key_payload_reserve); /* * Change the key state to being instantiated. */ static void mark_key_instantiated(struct key *key, int reject_error) { /* Commit the payload before setting the state; barrier versus * key_read_state(). */ smp_store_release(&key->state, (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); } /* * Instantiate a key and link it into the target keyring atomically. Must be * called with the target keyring's semaphore writelocked. The target key's * semaphore need not be locked as instantiation is serialised by * key_construction_mutex. */ static int __key_instantiate_and_link(struct key *key, struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, struct assoc_array_edit **_edit) { int ret, awaken; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* instantiate the key */ ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; /* and link it into the destination keyring */ if (keyring) { if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); __key_link(keyring, key, _edit); } /* disable the authorisation key */ if (authkey) key_invalidate(authkey); if (prep->expiry != TIME64_MAX) key_set_expiry(key, prep->expiry); } } mutex_unlock(&key_construction_mutex); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret; } /** * key_instantiate_and_link - Instantiate a key and link it into the keyring. * @key: The key to instantiate. * @data: The data to use to instantiate the keyring. * @datalen: The length of @data. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Instantiate a key that's in the uninstantiated state using the provided data * and, if successful, link it in to the destination keyring if one is * supplied. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. */ int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey) { struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; int ret; memset(&prep, 0, sizeof(prep)); prep.orig_description = key->description; prep.data = data; prep.datalen = datalen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } if (keyring) { ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_link_end; if (keyring->restrict_link && keyring->restrict_link->check) { struct key_restriction *keyres = keyring->restrict_link; ret = keyres->check(keyring, key->type, &prep.payload, keyres->key); if (ret < 0) goto error_link_end; } } ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit); error_link_end: if (keyring) __key_link_end(keyring, &key->index_key, edit); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_instantiate_and_link); /** * key_reject_and_link - Negatively instantiate a key and link it into the keyring. * @key: The key to instantiate. * @timeout: The timeout on the negative key. * @error: The error to return when the key is hit. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Negatively instantiate a key that's in the uninstantiated state and, if * successful, set its timeout and stored error and link it in to the * destination keyring if one is supplied. The key and any links to the key * will be automatically garbage collected after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return the stored error code (typically ENOKEY) until the negative * key expires. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. */ int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey) { struct assoc_array_edit *edit = NULL; int ret, awaken, link_ret = 0; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; if (keyring) { if (keyring->restrict_link) return -EPERM; link_ret = __key_link_lock(keyring, &key->index_key); if (link_ret == 0) { link_ret = __key_link_begin(keyring, &key->index_key, &edit); if (link_ret < 0) __key_link_end(keyring, &key->index_key, edit); } } mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key_set_expiry(key, ktime_get_real_seconds() + timeout); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; ret = 0; /* and link it into the destination keyring */ if (keyring && link_ret == 0) __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) key_invalidate(authkey); } mutex_unlock(&key_construction_mutex); if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret == 0 ? link_ret : ret; } EXPORT_SYMBOL(key_reject_and_link); /** * key_put - Discard a reference to a key. * @key: The key to discard a reference from. * * Discard a reference to a key, and when all the references are gone, we * schedule the cleanup task to come and pull it out of the tree in process * context at some later time. */ void key_put(struct key *key) { if (key) { key_check(key); if (refcount_dec_and_test(&key->usage)) { unsigned long flags; /* deal with the user's key tracking and quota */ if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { spin_lock_irqsave(&key->user->lock, flags); key->user->qnkeys--; key->user->qnbytes -= key->quotalen; spin_unlock_irqrestore(&key->user->lock, flags); } /* Mark key as safe for GC after key->user done. */ clear_bit_unlock(KEY_FLAG_USER_ALIVE, &key->flags); schedule_work(&key_gc_work); } } } EXPORT_SYMBOL(key_put); /* * Find a key by its serial number. */ struct key *key_lookup(key_serial_t id) { struct rb_node *n; struct key *key; spin_lock(&key_serial_lock); /* search the tree for the specified key */ n = key_serial_tree.rb_node; while (n) { key = rb_entry(n, struct key, serial_node); if (id < key->serial) n = n->rb_left; else if (id > key->serial) n = n->rb_right; else goto found; } not_found: key = ERR_PTR(-ENOKEY); goto error; found: /* A key is allowed to be looked up only if someone still owns a * reference to it - otherwise it's awaiting the gc. */ if (!refcount_inc_not_zero(&key->usage)) goto not_found; error: spin_unlock(&key_serial_lock); return key; } EXPORT_SYMBOL(key_lookup); /* * Find and lock the specified key type against removal. * * We return with the sem read-locked if successful. If the type wasn't * available -ENOKEY is returned instead. */ struct key_type *key_type_lookup(const char *type) { struct key_type *ktype; down_read(&key_types_sem); /* look up the key type to see if it's one of the registered kernel * types */ list_for_each_entry(ktype, &key_types_list, link) { if (strcmp(ktype->name, type) == 0) goto found_kernel_type; } up_read(&key_types_sem); ktype = ERR_PTR(-ENOKEY); found_kernel_type: return ktype; } void key_set_timeout(struct key *key, unsigned timeout) { time64_t expiry = TIME64_MAX; /* make the changes with the locks held to prevent races */ down_write(&key->sem); if (timeout > 0) expiry = ktime_get_real_seconds() + timeout; key_set_expiry(key, expiry); up_write(&key->sem); } EXPORT_SYMBOL_GPL(key_set_timeout); /* * Unlock a key type locked by key_type_lookup(). */ void key_type_put(struct key_type *ktype) { up_read(&key_types_sem); } /* * Attempt to update an existing key. * * The key is given to us with an incremented refcount that we need to discard * if we get an error. */ static inline key_ref_t __key_update(key_ref_t key_ref, struct key_preparsed_payload *prep) { struct key *key = key_ref_to_ptr(key_ref); int ret; /* need write permission on the key to update it */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) goto error; ret = -EEXIST; if (!key->type->update) goto error; down_write(&key->sem); ret = key->type->update(key, prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); if (ret < 0) goto error; out: return key_ref; error: key_put(key); key_ref = ERR_PTR(ret); goto out; } /* * Create or potentially update a key. The combined logic behind * key_create_or_update() and key_create() */ static key_ref_t __key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags, bool allow_update) { struct keyring_index_key index_key = { .description = description, }; struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; const struct cred *cred = current_cred(); struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; struct key_restriction *restrict_link = NULL; /* look up the key type to see if it's one of the registered kernel * types */ index_key.type = key_type_lookup(type); if (IS_ERR(index_key.type)) { key_ref = ERR_PTR(-ENODEV); goto error; } key_ref = ERR_PTR(-EINVAL); if (!index_key.type->instantiate || (!index_key.description && !index_key.type->preparse)) goto error_put_type; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION)) restrict_link = keyring->restrict_link; key_ref = ERR_PTR(-ENOTDIR); if (keyring->type != &key_type_keyring) goto error_put_type; memset(&prep, 0, sizeof(prep)); prep.orig_description = description; prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; prep.expiry = TIME64_MAX; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } if (!index_key.description) index_key.description = prep.description; key_ref = ERR_PTR(-EINVAL); if (!index_key.description) goto error_free_prep; } index_key.desc_len = strlen(index_key.description); key_set_index_key(&index_key); ret = __key_link_lock(keyring, &index_key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } if (restrict_link && restrict_link->check) { ret = restrict_link->check(keyring, index_key.type, &prep.payload, restrict_link->key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } } /* if we're going to allocate a new key, we're going to have * to modify the keyring */ ret = key_permission(keyring_ref, KEY_NEED_WRITE); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } /* if it's requested and possible to update this type of key, search * for an existing key of the same type and description in the * destination keyring and update that instead if possible */ if (allow_update) { if (index_key.type->update) { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) goto found_matching_key; } } else { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) { key_ref_put(key_ref); key_ref = ERR_PTR(-EEXIST); goto error_link_end; } } /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (index_key.type->read) perm |= KEY_POS_READ; if (index_key.type == &key_type_keyring || index_key.type->update) perm |= KEY_POS_WRITE; } /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, cred->fsuid, cred->fsgid, cred, perm, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; } /* instantiate it and link it into the target keyring */ ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); goto error_link_end; } security_key_post_create_or_update(keyring, key, payload, plen, flags, true); key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); error_link_end: __key_link_end(keyring, &index_key, edit); error_free_prep: if (index_key.type->preparse) index_key.type->free_preparse(&prep); error_put_type: key_type_put(index_key.type); error: return key_ref; found_matching_key: /* we found a matching key, so we're going to try to update it * - we can drop the locks first as we have the key pinned */ __key_link_end(keyring, &index_key, edit); key = key_ref_to_ptr(key_ref); if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) { ret = wait_for_key_construction(key, true); if (ret < 0) { key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error_free_prep; } } key_ref = __key_update(key_ref, &prep); if (!IS_ERR(key_ref)) security_key_post_create_or_update(keyring, key, payload, plen, flags, false); goto error_free_prep; } /** * key_create_or_update - Update or create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Search the destination keyring for a key of the same description and if one * is found, update it, otherwise create and instantiate a new one and create a * link to it from that keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -ENODEV if the key type * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the * caller isn't permitted to modify the keyring or the LSM did not permit * creation of the key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. */ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, true); } EXPORT_SYMBOL(key_create_or_update); /** * key_create - Create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Create and instantiate a new key and link to it from the destination keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -EEXIST if a key with the * same description already exists, -ENODEV if the key type wasn't available, * -ENOTDIR if the keyring wasn't a keyring, -EACCES if the caller isn't * permitted to modify the keyring or the LSM did not permit creation of the * key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. */ key_ref_t key_create(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, false); } EXPORT_SYMBOL(key_create); /** * key_update - Update a key's contents. * @key_ref: The pointer (plus possession flag) to the key. * @payload: The data to be used to update the key. * @plen: The length of @payload. * * Attempt to update the contents of a key with the given payload data. The * caller must be granted Write permission on the key. Negative keys can be * instantiated by this method. * * Returns 0 on success, -EACCES if not permitted and -EOPNOTSUPP if the key * type does not support updating. The key type may return other errors. */ int key_update(key_ref_t key_ref, const void *payload, size_t plen) { struct key_preparsed_payload prep; struct key *key = key_ref_to_ptr(key_ref); int ret; key_check(key); /* the key must be writable */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) return ret; /* attempt to update it if supported */ if (!key->type->update) return -EOPNOTSUPP; memset(&prep, 0, sizeof(prep)); prep.data = payload; prep.datalen = plen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } down_write(&key->sem); ret = key->type->update(key, &prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_update); /** * key_revoke - Revoke a key. * @key: The key to be revoked. * * Mark a key as being revoked and ask the type to free up its resources. The * revocation timeout is set and the key and all its links will be * automatically garbage collected after key_gc_delay amount of time if they * are not manually dealt with first. */ void key_revoke(struct key *key) { time64_t time; key_check(key); /* make sure no one's trying to change or use the key when we mark it * - we tell lockdep that we might nest because we might be revoking an * authorisation key whilst holding the sem on a key we've just * instantiated */ down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { notify_key(key, NOTIFY_KEY_REVOKED, 0); if (key->type->revoke) key->type->revoke(key); /* set the death time to no more than the expiry time */ time = ktime_get_real_seconds(); if (key->revoked_at == 0 || key->revoked_at > time) { key->revoked_at = time; key_schedule_gc(key->revoked_at + key_gc_delay); } } up_write(&key->sem); } EXPORT_SYMBOL(key_revoke); /** * key_invalidate - Invalidate a key. * @key: The key to be invalidated. * * Mark a key as being invalidated and have it cleaned up immediately. The key * is ignored by all searches and other operations from this point. */ void key_invalidate(struct key *key) { kenter("%d", key_serial(key)); key_check(key); if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); } up_write(&key->sem); } } EXPORT_SYMBOL(key_invalidate); /** * generic_key_instantiate - Simple instantiation of a key from preparsed data * @key: The key to be instantiated * @prep: The preparsed data to load. * * Instantiate a key from preparsed data. We assume we can just copy the data * in directly and clear the old pointers. * * This can be pointed to directly by the key type instantiate op pointer. */ int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { int ret; pr_devel("==>%s()\n", __func__); ret = key_payload_reserve(key, prep->quotalen); if (ret == 0) { rcu_assign_keypointer(key, prep->payload.data[0]); key->payload.data[1] = prep->payload.data[1]; key->payload.data[2] = prep->payload.data[2]; key->payload.data[3] = prep->payload.data[3]; prep->payload.data[0] = NULL; prep->payload.data[1] = NULL; prep->payload.data[2] = NULL; prep->payload.data[3] = NULL; } pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL(generic_key_instantiate); /** * register_key_type - Register a type of key. * @ktype: The new key type. * * Register a new key type. * * Returns 0 on success or -EEXIST if a type of this name already exists. */ int register_key_type(struct key_type *ktype) { struct key_type *p; int ret; memset(&ktype->lock_class, 0, sizeof(ktype->lock_class)); ret = -EEXIST; down_write(&key_types_sem); /* disallow key types with the same name */ list_for_each_entry(p, &key_types_list, link) { if (strcmp(p->name, ktype->name) == 0) goto out; } /* store the type */ list_add(&ktype->link, &key_types_list); pr_notice("Key type %s registered\n", ktype->name); ret = 0; out: up_write(&key_types_sem); return ret; } EXPORT_SYMBOL(register_key_type); /** * unregister_key_type - Unregister a type of key. * @ktype: The key type. * * Unregister a key type and mark all the extant keys of this type as dead. * Those keys of this type are then destroyed to get rid of their payloads and * they and their links will be garbage collected as soon as possible. */ void unregister_key_type(struct key_type *ktype) { down_write(&key_types_sem); list_del_init(&ktype->link); downgrade_write(&key_types_sem); key_gc_keytype(ktype); pr_notice("Key type %s unregistered\n", ktype->name); up_read(&key_types_sem); } EXPORT_SYMBOL(unregister_key_type); /* * Initialise the key management state. */ void __init key_init(void) { /* allocate a slab in which we can store keys */ key_jar = kmem_cache_create("key_jar", sizeof(struct key), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* add the special key types */ list_add_tail(&key_type_keyring.link, &key_types_list); list_add_tail(&key_type_dead.link, &key_types_list); list_add_tail(&key_type_user.link, &key_types_list); list_add_tail(&key_type_logon.link, &key_types_list); /* record the root user tracking */ rb_link_node(&root_key_user.node, NULL, &key_user_tree.rb_node); rb_insert_color(&root_key_user.node, &key_user_tree); } |
| 189 295 295 25 418 235 164 161 132 164 249 164 137 248 132 5 164 121 132 235 235 235 235 235 235 234 106 234 234 147 106 164 164 25 164 164 46 27 164 295 46 295 165 164 3 3 165 3 164 165 165 165 62 165 41 62 41 125 92 92 719 4 720 3 2 87 132 9448 9361 229 9442 135 114 87 135 135 132 132 1 3 2 135 133 135 134 135 135 135 340 342 258 6 6 91 88 88 88 54 54 9241 9241 25 25 25 1 25 25 178 88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: * The group->recnt and mark->refcnt tell how many "things" in the kernel * currently are referencing the objects. Both kind of objects typically will * live inside the kernel with a refcnt of 2, one for its creation and one for * the reference a group and a mark hold to each other. * If you are holding the appropriate locks, you can take a reference and the * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: * There are 3 locks involved with fsnotify inode marks and they MUST be taken * in order as follows: * * group->mark_mutex * mark->lock * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private * data (i.e group limits). * mark->lock protects the marks attributes like its masks and flags. * Furthermore it protects the access to a reference of the group that the mark * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * * mark->connector->lock protects the list of marks anchored inside an * inode / vfsmount and each mark is hooked via the i_list. * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets * detached from fsnotify_mark_connector when last reference to the mark is * dropped. Thus having mark reference is enough to protect mark->connector * pointer and to make sure fsnotify_mark_connector cannot disappear. Also * because we remove mark from g_list before dropping mark reference associated * with that, any mark found through g_list is guaranteed to have * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) * - The inode is being evicted from cache. (fsnotify_inode_delete) * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> #include <linux/ratelimit.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); static void fsnotify_connector_destroy_workfn(struct work_struct *work); static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { WARN_ON_ONCE(!refcount_read(&mark->refcnt)); refcount_inc(&mark->refcnt); } static fsnotify_connp_t *fsnotify_object_connp(void *obj, enum fsnotify_obj_type obj_type) { switch (obj_type) { case FSNOTIFY_OBJ_TYPE_INODE: return &((struct inode *)obj)->i_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_VFSMOUNT: return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); case FSNOTIFY_OBJ_TYPE_MNTNS: return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } } static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) { if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn) { if (WARN_ON(!fsnotify_valid_obj_type(conn->type))) return 0; return READ_ONCE(*fsnotify_conn_mask_p(conn)); } static void fsnotify_get_sb_watched_objects(struct super_block *sb) { atomic_long_inc(fsnotify_sb_watched_objects(sb)); } static void fsnotify_put_sb_watched_objects(struct super_block *sb) { atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb); /* the superblock can go away after this decrement */ if (atomic_long_dec_and_test(watched_objects)) wake_up_var(watched_objects); } static void fsnotify_get_inode_ref(struct inode *inode) { ihold(inode); fsnotify_get_sb_watched_objects(inode->i_sb); } static void fsnotify_put_inode_ref(struct inode *inode) { /* read ->i_sb before the inode can go away */ struct super_block *sb = inode->i_sb; iput(inode); fsnotify_put_sb_watched_objects(sb); } /* * Grab or drop watched objects reference depending on whether the connector * is attached and has any marks attached. */ static void fsnotify_update_sb_watchers(struct super_block *sb, struct fsnotify_mark_connector *conn) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED; struct fsnotify_mark *first_mark = NULL; unsigned int highest_prio = 0; if (conn->obj) first_mark = hlist_entry_safe(conn->list.first, struct fsnotify_mark, obj_list); if (first_mark) highest_prio = first_mark->group->priority; if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM)) highest_prio = 0; /* * If the highest priority of group watching this object is prio, * then watched object has a reference on counters [0..prio]. * Update priority >= 1 watched objects counters. */ for (unsigned int p = conn->prio + 1; p <= highest_prio; p++) atomic_long_inc(&sbinfo->watched_objects[p]); for (unsigned int p = conn->prio; p > highest_prio; p--) atomic_long_dec(&sbinfo->watched_objects[p]); conn->prio = highest_prio; /* Update priority >= 0 (a.k.a total) watched objects counter */ BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0); if (first_mark && !is_watched) { conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_get_sb_watched_objects(sb); } else if (!first_mark && is_watched) { conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED; fsnotify_put_sb_watched_objects(sb); } } /* * Grab or drop inode reference for the connector if needed. * * When it's time to drop the reference, we only clear the HAS_IREF flag and * return the inode object. fsnotify_drop_object() will be resonsible for doing * iput() outside of spinlocks. This happens when last mark that wanted iref is * detached. */ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn, bool want_iref) { bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF; struct inode *inode = NULL; if (conn->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == has_iref) return NULL; if (want_iref) { /* Pin inode if any mark wants inode refcount held */ fsnotify_get_inode_ref(fsnotify_conn_inode(conn)); conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF; } else { /* Unpin inode after detach of last mark that wanted iref */ inode = fsnotify_conn_inode(conn); conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF; } return inode; } static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; bool want_iref = false; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); /* We can get detached connector here when inode is getting unlinked. */ if (!fsnotify_valid_obj_type(conn->type)) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) continue; new_mask |= fsnotify_calc_mask(mark); if (conn->type == FSNOTIFY_OBJ_TYPE_INODE && !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) want_iref = true; } /* * We use WRITE_ONCE() to prevent silly compiler optimizations from * confusing readers not holding conn->lock with partial updates. */ WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask); return fsnotify_update_iref(conn, want_iref); } static bool fsnotify_conn_watches_children( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return false; return fsnotify_inode_watches_children(fsnotify_conn_inode(conn)); } static void fsnotify_conn_set_children_dentry_flags( struct fsnotify_mark_connector *conn) { if (conn->type != FSNOTIFY_OBJ_TYPE_INODE) return; fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn)); } /* * Calculate mask of events for a list of marks. The caller must make sure * connector and connector->obj cannot disappear under us. Callers achieve * this by holding a mark->lock or mark->group->mark_mutex for a mark on this * list. */ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { bool update_children; if (!conn) return; spin_lock(&conn->lock); update_children = !fsnotify_conn_watches_children(conn); __fsnotify_recalc_mask(conn); update_children &= fsnotify_conn_watches_children(conn); spin_unlock(&conn->lock); /* * Set children's PARENT_WATCHED flags only if parent started watching. * When parent stops watching, we clear false positive PARENT_WATCHED * flags lazily in __fsnotify_parent(). */ if (update_children) fsnotify_conn_set_children_dentry_flags(conn); } /* Free all connectors queued for freeing once SRCU period ends */ static void fsnotify_connector_destroy_workfn(struct work_struct *work) { struct fsnotify_mark_connector *conn, *free; spin_lock(&destroy_lock); conn = connector_destroy_list; connector_destroy_list = NULL; spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); while (conn) { free = conn; conn = conn->destroy_next; kmem_cache_free(fsnotify_mark_connector_cachep, free); } } static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) { fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type); struct super_block *sb = fsnotify_connector_sb(conn); struct inode *inode = NULL; *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; /* Unpin inode when detaching from connector */ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)) inode = NULL; } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; if (sb) fsnotify_update_sb_watchers(sb, conn); return inode; } static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; if (WARN_ON_ONCE(!group)) return; group->ops->free_mark(mark); fsnotify_put_group(group); } /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; } /* * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { struct super_block *sb = fsnotify_connector_sb(conn); /* Update watched objects after detaching mark */ if (sb) fsnotify_update_sb_watchers(sb, conn); objp = __fsnotify_recalc_mask(conn); type = conn->type; } WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object * list. Mark can be already removed from the list by now and on its way to be * destroyed once SRCU period ends. * * Also pin the group so it doesn't disappear under us. */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { if (!mark) return true; if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { /* mark is attached, group is still alive then */ atomic_inc(&mark->group->user_waits); spin_unlock(&mark->lock); return true; } spin_unlock(&mark->lock); fsnotify_put_mark(mark); } return false; } /* * Puts marks and wakes up group destruction if necessary. * * Pairs with fsnotify_get_mark_safe() */ static void fsnotify_put_mark_wake(struct fsnotify_mark *mark) { if (mark) { struct fsnotify_group *group = mark->group; fsnotify_put_mark(mark); /* * We abuse notification_waitq on group shutdown for waiting for * all marks pinned when waiting for userspace. */ if (atomic_dec_and_test(&group->user_waits) && group->shutdown) wake_up(&group->notification_waitq); } } bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) __releases(&fsnotify_mark_srcu) { int type; fsnotify_foreach_iter_type(type) { /* This can fail if mark is being removed */ if (!fsnotify_get_mark_safe(iter_info->marks[type])) { __release(&fsnotify_mark_srcu); goto fail; } } /* * Now that both marks are pinned by refcount in the inode / vfsmount * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); return true; fail: for (type--; type >= 0; type--) fsnotify_put_mark_wake(iter_info->marks[type]); return false; } void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) __acquires(&fsnotify_mark_srcu) { int type; iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); fsnotify_foreach_iter_type(type) fsnotify_put_mark_wake(iter_info->marks[type]); } /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being * removed from group list before corresponding reference to it is dropped. In * particular we rely on mark->connector being valid while we hold * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { fsnotify_group_assert_locked(mark->group); WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && refcount_read(&mark->refcnt) < 1 + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; list_del_init(&mark->g_list); spin_unlock(&mark->lock); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } /* * Free fsnotify mark. The mark is actually only marked as being freed. The * freeing is actually happening only once last reference to the mark is * dropped from a workqueue which first waits for srcu period end. * * Caller must have a reference to the mark or be protected by * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; spin_lock(&mark->lock); /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark * is being freed. */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { fsnotify_group_lock(group); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); } EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. * * Fanotify supports different notification classes (reflected as priority of * notification group). Events shall be passed to notification groups in * decreasing priority order. To achieve this marks in notification lists for * inodes and vfsmounts are sorted so that priorities of corresponding groups * are descending. * * Furthermore correct handling of the ignore mask requires processing inode * and vfsmount marks of each group together. Using the group address as * further sort criterion provides a unique sorting order and thus we can * merge inode and vfsmount lists of marks in linear time and find groups * present in both lists. * * A return value of 1 signifies that b has priority over a. * A return value of 0 signifies that the two marks have to be handled together. * A return value of -1 signifies that a has priority over b. */ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) { if (a == b) return 0; if (!a) return 1; if (!b) return -1; if (a->priority < b->priority) return 1; if (a->priority > b->priority) return -1; if (a < b) return 1; return -1; } static int fsnotify_attach_info_to_sb(struct super_block *sb) { struct fsnotify_sb_info *sbinfo; /* sb info is freed on fsnotify_sb_delete() */ sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL); if (!sbinfo) return -ENOMEM; /* * cmpxchg() provides the barrier so that callers of fsnotify_sb_info() * will observe an initialized structure */ if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) { /* Someone else created sbinfo for us */ kfree(sbinfo); } return 0; } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, void *obj, unsigned int obj_type) { struct fsnotify_mark_connector *conn; conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); conn->flags = 0; conn->prio = 0; conn->type = obj_type; conn->obj = obj; /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } /* * Get mark connector, make sure it is alive and return with its lock held. * This is for users that get connector pointer from inode or mount. Users that * hold reference to a mark on the list may directly lock connector->lock as * they are sure list cannot go away under them. */ static struct fsnotify_mark_connector *fsnotify_grab_connector( fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; int idx; idx = srcu_read_lock(&fsnotify_mark_srcu); conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (!conn) goto out; spin_lock(&conn->lock); if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) { spin_unlock(&conn->lock); srcu_read_unlock(&fsnotify_mark_srcu, idx); return NULL; } out: srcu_read_unlock(&fsnotify_mark_srcu, idx); return conn; } /* * Add mark into proper place in given list of marks. These marks may be used * for the fsnotify backend to determine which event types should be delivered * to which group and for which inodes. These marks are ordered according to * priority, highest number first, and then by the group's location in memory. */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct super_block *sb = fsnotify_object_sb(obj, obj_type); struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; fsnotify_connp_t *connp; int cmp; int err = 0; if (WARN_ON(!fsnotify_valid_obj_type(obj_type))) return -EINVAL; /* * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; } connp = fsnotify_object_connp(obj, obj_type); restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, obj, obj_type); if (err) return err; goto restart; } /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); goto added; } /* should mark be in the middle of the current list? */ hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) { err = -EEXIST; goto out_err; } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: if (sb) fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone * seeing mark->connector set. */ WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { struct fsnotify_group *group = mark->group; int ret = 0; fsnotify_group_assert_locked(group); /* * LOCKING ORDER!!!! * group->mark_mutex * mark->lock * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags); if (ret) goto err; fsnotify_recalc_mask(mark->connector); return ret; err: spin_lock(&mark->lock); mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); fsnotify_put_mark(mark); return ret; } int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags) { int ret; struct fsnotify_group *group = mark->group; fsnotify_group_lock(group); ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags); fsnotify_group_unlock(group); return ret; } EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group) { fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type); struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; if (!connp) return NULL; conn = fsnotify_grab_connector(connp); if (!conn) return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group && (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; } } spin_unlock(&conn->lock); return NULL; } EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); struct list_head *head = &to_free; /* Skip selection step if we want to clear all marks. */ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) { head = &group->marks_list; goto clear; } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our * to_free list so we have to use mark_mutex even when accessing that * list. And freeing mark requires us to drop mark_mutex. So we can * reliably free only the first mark in the list. That's why we first * move marks to free to to_free list in one go and then free marks in * to_free list one by one. */ fsnotify_group_lock(group); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { if (mark->connector->type == obj_type) list_move(&mark->g_list, &to_free); } fsnotify_group_unlock(group); clear: while (1) { fsnotify_group_lock(group); if (list_empty(head)) { fsnotify_group_unlock(group); break; } mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); fsnotify_group_unlock(group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } /* Destroy all marks attached to an object via connector */ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; void *objp; unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) return; /* * We have to be careful since we can race with e.g. * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the * list can get modified. However we are holding mark reference and * thus our mark cannot be removed from obj_list so we can continue * iteration after regaining conn->lock. */ hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); old_mark = mark; fsnotify_destroy_mark(mark, mark->group); spin_lock(&conn->lock); } /* * Detach list from object now so that we don't pin inode until all * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; WRITE_ONCE(mark->connector, NULL); } EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; spin_lock(&destroy_lock); /* exchange the list head */ list_replace_init(&destroy_list, &private_destroy_list); spin_unlock(&destroy_lock); synchronize_srcu(&fsnotify_mark_srcu); list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); fsnotify_final_mark_destroy(mark); } } /* Wait for all marks queued for destruction to be actually destroyed */ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); |
| 39 39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM jbd2 #if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_JBD2_H #include <linux/jbd2.h> #include <linux/tracepoint.h> struct transaction_chp_stats_s; struct transaction_run_stats_s; TRACE_EVENT(jbd2_checkpoint, TP_PROTO(journal_t *journal, int result), TP_ARGS(journal, result), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, result ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->result = result; ), TP_printk("dev %d,%d result %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( __field( dev_t, dev ) __field( char, sync_commit ) __field( tid_t, transaction ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), TP_printk("dev %d,%d transaction %u sync %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_commit_locking, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_commit_logging, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); TRACE_EVENT(jbd2_end_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( __field( dev_t, dev ) __field( char, sync_commit ) __field( tid_t, transaction ) __field( tid_t, head ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), TP_printk("dev %d,%d transaction %u sync %d head %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); DECLARE_EVENT_CLASS(jbd2_handle_start_class, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, requested_blocks) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->requested_blocks = requested_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u " "requested_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->requested_blocks) ); DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks) ); DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks) ); TRACE_EVENT(jbd2_handle_extend, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int buffer_credits, int requested_blocks), TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, buffer_credits ) __field( int, requested_blocks) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->buffer_credits = buffer_credits; __entry->requested_blocks = requested_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u " "buffer_credits %d requested_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->buffer_credits, __entry->requested_blocks) ); TRACE_EVENT(jbd2_handle_stats, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int interval, int sync, int requested_blocks, int dirtied_blocks), TP_ARGS(dev, tid, type, line_no, interval, sync, requested_blocks, dirtied_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, interval ) __field( int, sync ) __field( int, requested_blocks) __field( int, dirtied_blocks ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->interval = interval; __entry->sync = sync; __entry->requested_blocks = requested_blocks; __entry->dirtied_blocks = dirtied_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u interval %d " "sync %d requested_blocks %d dirtied_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->interval, __entry->sync, __entry->requested_blocks, __entry->dirtied_blocks) ); TRACE_EVENT(jbd2_run_stats, TP_PROTO(dev_t dev, tid_t tid, struct transaction_run_stats_s *stats), TP_ARGS(dev, tid, stats), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned long, wait ) __field( unsigned long, request_delay ) __field( unsigned long, running ) __field( unsigned long, locked ) __field( unsigned long, flushing ) __field( unsigned long, logging ) __field( __u32, handle_count ) __field( __u32, blocks ) __field( __u32, blocks_logged ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->wait = stats->rs_wait; __entry->request_delay = stats->rs_request_delay; __entry->running = stats->rs_running; __entry->locked = stats->rs_locked; __entry->flushing = stats->rs_flushing; __entry->logging = stats->rs_logging; __entry->handle_count = stats->rs_handle_count; __entry->blocks = stats->rs_blocks; __entry->blocks_logged = stats->rs_blocks_logged; ), TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u " "locked %u flushing %u logging %u handle_count %u " "blocks %u blocks_logged %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->request_delay), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), jiffies_to_msecs(__entry->flushing), jiffies_to_msecs(__entry->logging), __entry->handle_count, __entry->blocks, __entry->blocks_logged) ); TRACE_EVENT(jbd2_checkpoint_stats, TP_PROTO(dev_t dev, tid_t tid, struct transaction_chp_stats_s *stats), TP_ARGS(dev, tid, stats), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned long, chp_time ) __field( __u32, forced_to_close ) __field( __u32, written ) __field( __u32, dropped ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->chp_time = stats->cs_chp_time; __entry->forced_to_close= stats->cs_forced_to_close; __entry->written = stats->cs_written; __entry->dropped = stats->cs_dropped; ), TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u " "written %u dropped %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); TRACE_EVENT(jbd2_update_log_tail, TP_PROTO(journal_t *journal, tid_t first_tid, unsigned long block_nr, unsigned long freed), TP_ARGS(journal, first_tid, block_nr, freed), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tail_sequence ) __field( tid_t, first_tid ) __field(unsigned long, block_nr ) __field(unsigned long, freed ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->tail_sequence = journal->j_tail_sequence; __entry->first_tid = first_tid; __entry->block_nr = block_nr; __entry->freed = freed; ), TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tail_sequence, __entry->first_tid, __entry->block_nr, __entry->freed) ); TRACE_EVENT(jbd2_write_superblock, TP_PROTO(journal_t *journal, blk_opf_t write_flags), TP_ARGS(journal, write_flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( blk_opf_t, write_flags ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->write_flags = write_flags; ), TP_printk("dev %d,%d write_flags %x", MAJOR(__entry->dev), MINOR(__entry->dev), (__force u32)__entry->write_flags) ); TRACE_EVENT(jbd2_lock_buffer_stall, TP_PROTO(dev_t dev, unsigned long stall_ms), TP_ARGS(dev, stall_ms), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, stall_ms ) ), TP_fast_assign( __entry->dev = dev; __entry->stall_ms = stall_ms; ), TP_printk("dev %d,%d stall_ms %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->stall_ms) ); DECLARE_EVENT_CLASS(jbd2_journal_shrink, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(unsigned long, count) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->nr_to_scan = nr_to_scan; __entry->count = count; ), TP_printk("dev %d,%d nr_to_scan %lu count %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->count) ); DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count) ); DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count) ); TRACE_EVENT(jbd2_shrink_scan_exit, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long nr_shrunk, unsigned long count), TP_ARGS(journal, nr_to_scan, nr_shrunk, count), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(unsigned long, nr_shrunk) __field(unsigned long, count) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->nr_to_scan = nr_to_scan; __entry->nr_shrunk = nr_shrunk; __entry->count = count; ), TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->nr_shrunk, __entry->count) ); TRACE_EVENT(jbd2_shrink_checkpoint_list, TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, unsigned long nr_freed, tid_t next_tid), TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, first_tid) __field(tid_t, tid) __field(tid_t, last_tid) __field(unsigned long, nr_freed) __field(tid_t, next_tid) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->first_tid = first_tid; __entry->tid = tid; __entry->last_tid = last_tid; __entry->nr_freed = nr_freed; __entry->next_tid = next_tid; ), TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " "next transaction %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->first_tid, __entry->tid, __entry->last_tid, __entry->nr_freed, __entry->next_tid) ); #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 3391 3589 1207 1208 1208 16 2804 227 1135 1209 4 6 21 1 11 14 2 1 61 24 149 146 145 146 3 119 1 132 72 119 89 673 674 671 669 4 675 27 1956 82 2366 2366 2362 1067 1067 1066 1065 1067 1067 2662 2661 942 748 748 2366 2367 2367 2364 2362 342 2207 516 411 1294 134 1818 45 45 45 45 45 45 45 623 582 177 1817 1822 1818 1815 1816 7 1818 2 1804 18 1810 45 1813 1151 10 508 1484 1809 2 1616 262 1811 1810 1806 10 3 13 1686 15 16 6 2 13 1 13 12 1 6 5 2 1 2367 2366 916 916 9 2365 2 9 8 1220 1218 1218 1222 1222 1221 1220 1220 88 15 13 12 15 83 573 656 752 657 657 271 13 13 83 660 659 279 657 752 2 11 8 303 303 1 85 222 73 150 124 88 136 15 2 13 9 54 25 35 12 13 1 35 18 28 19 24 4 17 5 4 27 19 19 7 12 19 15 4 13 4 4 15 31 279 24 300 3 239 136 301 82 1 2 2 1 1 1 1 952 1346 1350 36 952 976 976 980 979 740 741 739 740 494 248 30 2787 2788 2787 2789 2787 1718 1719 1720 40 212 16 16 2 2 5 7 10 1 2 2 2 1 248 247 2 7 2 227 9 6 3 231 236 155 79 1 5 2 225 1 1 190 1 1 191 4 1 129 2 2 147 75 148 73 24 1 24 24 1 1 1 1 3 1 8 1 1 1 2 1 1 2 2 24 23 24 2 2 1 24 24 24 10 1 1 2 6 5 1 2 2 3 2 30 22 8 9 5 25 24 24 24 19 24 1336 3 1 266 1232 1231 1338 1338 1 2 2 2 2 136 8 2 4 16 10 10 10 10 9 8 8 8 6 6 4 2 1 3 2 5 1 4 19 9 12 12 6 6 7 2 9 30 3 27 30 20 8 2 19 17 14 8 10 17 1 4 4 2 21 2 5 1 15 8 30 30 5 7 1 2 1 3 3 1 1 10 10 10 10 10 5 5 5 4 5 3 3 9 4 3 9 8 2 3 1 6 3 2 9 8 10 10 10 10 3 5 3 9 1328 1329 2882 2886 2863 2703 2866 2703 2887 1811 1 1812 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic address resolution entity * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. * Harald Welte Add neighbour cache statistics like rtstat */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> #include <net/ip.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <trace/events/neigh.h> #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ do { \ if (level <= NEIGH_DEBUG) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; #endif static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) { int i; switch (family) { default: DEBUG_NET_WARN_ON_ONCE(1); fallthrough; /* to avoid panic by null-ptr-deref */ case AF_INET: i = NEIGH_ARP_TABLE; break; case AF_INET6: i = NEIGH_ND_TABLE; break; } return &dev->neighbours[i]; } /* Neighbour hash table buckets are protected with rwlock tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks to protocol backends, no attempts to send something to network. It will result in deadlocks, if backend/driver wants to use neighbour cache. - If the entry requires some non-trivial actions, increase its reference count and release table lock. Neighbour entries are protected: - with reference count. - with rwlock neigh->lock Reference count prevents destruction. neigh->lock mainly serializes ll address data and its validity state. However, the same lock is used to protect another entry fields: - timer - resolution queue Again, nothing clever shall be made under neigh->lock, the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. */ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, * because it is really reasonable choice. */ unsigned long neigh_rand_reach_time(unsigned long base) { return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static void neigh_mark_dead(struct neighbour *n) { n->dead = 1; if (!list_empty(&n->gc_list)) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } if (!list_empty(&n->managed_list)) list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; /* remove from the gc list if new state is permanent or if neighbor is * externally learned / validated; otherwise entry should be on the gc * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } else if (!exempt_from_gc && !on_gc_list) { /* add entries to the tail; cleaning removes from the front */ list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; write_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; add_to_managed = n->flags & NTF_MANAGED; on_managed_list = !list_empty(&n->managed_list); if (!add_to_managed && on_managed_list) list_del_init(&n->managed_list); else if (add_to_managed && !on_managed_list) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, bool *gc_update, bool *managed_update) { u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) return; ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; *notify = 1; *gc_update = true; } if ((old_flags ^ ndm_flags) & NTF_MANAGED) { if (ndm_flags & NTF_MANAGED) neigh->flags |= NTF_MANAGED; else neigh->flags &= ~NTF_MANAGED; *notify = 1; *managed_update = true; } if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { if (ndm_flags & NTF_EXT_VALIDATED) neigh->flags |= NTF_EXT_VALIDATED; else neigh->flags &= ~NTF_EXT_VALIDATED; *notify = 1; *gc_update = true; } } bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - READ_ONCE(tbl->gc_thresh2); u64 tmax = ktime_get_ns() + NSEC_PER_MSEC; unsigned long tref = jiffies - 5 * HZ; struct neighbour *n, *tmp; int shrunk = 0; int loop = 0; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { bool remove = false; write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || (n->nud_state == NUD_NOARP) || (tbl->is_multicast && tbl->is_multicast(n->primary_key)) || !time_in_range(n->updated, tref, jiffies)) remove = true; write_unlock(&n->lock); if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; if (++loop == 16) { if (ktime_get_ns() > tmax) goto unlock; loop = 0; } } } WRITE_ONCE(tbl->last_flush, jiffies); unlock: write_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { /* Use safe distance from the jiffies - LONG_MAX point while timer * is running in DELAY/PROBE state but still show to user space * large times in the past. */ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ); neigh_hold(n); if (!time_in_range(n->confirmed, mint, jiffies)) n->confirmed = mint; if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && timer_delete(&n->timer)) { neigh_release(n); return 1; } return 0; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_parms_qlen_dec(struct net_device *dev, int family) { struct neigh_parms *p; rcu_read_lock(); p = neigh_get_dev_parms_rcu(dev, family); if (p) p->qlen--; rcu_read_unlock(); } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, int family) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_one(struct neighbour *n) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. * We must destroy neighbour entry, * but someone still uses it. * * The destroy will be delayed until * the last user releases us, but * we must kill timers etc. and move * it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; WRITE_ONCE(n->output, neigh_blackhole); if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct hlist_head *dev_head; struct hlist_node *tmp; struct neighbour *n; dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { if (skip_perm && (n->nud_state & NUD_PERMANENT || n->flags & NTF_EXT_VALIDATED)) continue; neigh_flush_one(n); } } static void neigh_flush_table(struct neigh_table *tbl) { struct neigh_hash_table *nht; int i; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) neigh_flush_one(n); } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { write_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } write_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) timer_delete_sync(&tbl->proxy_timer); return 0; } int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, true); return 0; } EXPORT_SYMBOL(neigh_carrier_down); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { __neigh_ifdown(tbl, dev, false); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries, gc_thresh3; if (exempt_from_gc) goto do_alloc; entries = atomic_inc_return(&tbl->gc_entries) - 1; gc_thresh3 = READ_ONCE(tbl->gc_thresh3); if (entries >= gc_thresh3 || (entries >= READ_ONCE(tbl->gc_thresh2) && time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: return n; out_entries: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; } ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); kfree(nht->hash_heads); kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock(); return n; } EXPORT_SYMBOL(neigh_lookup); static struct neighbour * ___neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, u32 flags, bool exempt_from_gc, bool want_ref) { u32 hash_val, key_len = tbl->key_len; struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; int error; n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); if (n->flags & NTF_MANAGED) list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: write_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; } EXPORT_IPV6_MOD(pneigh_lookup); int pneigh_create(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) { err = -ENOBUFS; goto out; } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); update: WRITE_ONCE(n->flags, flags); n->permanent = permanent; WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); return err; } static void pneigh_destroy(struct rcu_head *rcu) { struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); netdev_put(n->dev, &n->dev_tracker); kfree(n); } int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, __rcu **np; unsigned int key_len; u32 hash_val; key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); mutex_lock(&tbl->phash_lock); for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); return 0; } } mutex_unlock(&tbl->phash_lock); return -ENOENT; } static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; mutex_lock(&tbl->phash_lock); for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); list_del(&n->free_node); if (tbl->pdestructor) tbl->pdestructor(n); call_rcu(&n->rcu, pneigh_destroy); } } static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) kfree(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->output); } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. */ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); WRITE_ONCE(neigh->output, neigh->ops->connected_output); } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; struct hlist_node *tmp; struct neighbour *n; unsigned int i; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } if (time_before(n->used, n->confirmed) && time_is_before_eq_jiffies(n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); consume_skb(skb); } /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_REACHABLE); neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); WRITE_ONCE(neigh->nud_state, NUD_PROBE); neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { if (neigh->nud_state == NUD_PROBE && neigh->flags & NTF_EXT_VALIDATED) { WRITE_ONCE(neigh->nud_state, NUD_STALE); neigh->updated = jiffies; } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh_invalidate(neigh); } notify = 1; goto out; } if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/100)) next = jiffies + HZ/100; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) neigh_update_notify(neigh, 0); trace_neigh_timer_handler(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, const bool immediate_ok) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); neigh->updated = now; if (!immediate_ok) { next = now + 1; } else { immediate_probe = true; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ / 100); } neigh_add_timer(neigh, next); } else { WRITE_ONCE(neigh->nud_state, NUD_FAILED); neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); WRITE_ONCE(neigh->nud_state, NUD_DELAY); neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); trace_neigh_event_send_done(neigh, rc); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_USE means that the entry is user triggered. NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed or invalidated. Caller MUST hold reference count on the entry. */ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (neigh->dead) { NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); new = old; goto out; } if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { new = old & ~NUD_PERMANENT; WRITE_ONCE(neigh->nud_state, new); err = 0; goto out; } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); WRITE_ONCE(neigh->nud_state, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) { NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; } lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); WRITE_ONCE(neigh->nud_state, new); notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) { struct sk_buff *skb; /* Again: avoid dead loop if something went wrong */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst && READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } READ_ONCE(n1->output)(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); trace_neigh_update_done(neigh, err); return err; } int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE); atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. */ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_managed_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, managed_work.work); struct neighbour *neigh; write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); write_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } static unsigned long neigh_proxy_delay(struct neigh_parms *p) { /* If proxy_delay is zero, do not call get_random_u32_below() * as it is undefined behavior. */ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY); return proxy_delay ? jiffies + get_random_u32_below(proxy_delay) : jiffies; } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long sched_next = neigh_proxy_delay(p); if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } write_lock_bh(&tbl->lock); list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); INIT_LIST_HEAD(&tbl->managed_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); /* * Only called from ndisc_cleanup(), which means this is dead code * because we no longer can unload IPv6 module. */ int neigh_table_clear(int index, struct neigh_table *tbl) { RCU_INIT_POINTER(neigh_tables[index], NULL); synchronize_rcu(); /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } return tbl; } const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, [NDA_PROBES] = { .type = NLA_U32 }, [NDA_VLAN] = { .type = NLA_U16 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (!dst_attr) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; u32 ndm_flags; int err; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); if (err < 0) goto out; err = -EINVAL; if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; } ndm = nlmsg_data(nlh); ndm_flags = ndm->ndm_flags; if (tb[NDA_FLAGS_EXT]) { u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + hweight32(NTF_EXT_MASK))); ndm_flags |= (ext << NTF_EXT_SHIFT); } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; } dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } if (!dev) { NL_SET_ERR_MSG(extack, "Device not specified"); goto out; } if (tbl->allow_add && !tbl->allow_add(dev, extack)) { err = -EINVAL; goto out; } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || ndm_flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); err = -EINVAL; goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED will result in the neighbor * being created with an invalid state (NUD_NONE). */ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = NUD_NONE; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot create externally validated neighbor with an invalid state"); err = -EINVAL; goto out; } } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & (NTF_EXT_LEARNED | NTF_MANAGED | NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (ndm_flags & NTF_EXT_VALIDATED) { u8 state = ndm->ndm_state; /* NTF_USE and NTF_MANAGED do not update the existing * state other than clearing it if it was * NUD_PERMANENT. */ if (ndm_flags & (NTF_USE | NTF_MANAGED)) state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; if (!(state & NUD_VALID)) { NL_SET_ERR_MSG(extack, "Cannot mark neighbor as externally validated with an invalid state"); err = -EINVAL; neigh_release(neigh); goto out; } } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } if (protocol) neigh->protocol = protocol; if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; if (ndm_flags & NTF_MANAGED) flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; if (ndm_flags & NTF_EXT_VALIDATED) flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval), NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) || nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) || nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3))) goto nla_put_failure; { unsigned long now = jiffies; long flush_delta = now - READ_ONCE(tbl->last_flush); long rand_delta = now - READ_ONCE(tbl->last_rand); struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; rcu_read_lock(); nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); rcu_read_unlock(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += READ_ONCE(st->allocs); ndst.ndts_destroys += READ_ONCE(st->destroys); ndst.ndts_hash_grows += READ_ONCE(st->hash_grows); ndst.ndts_res_failed += READ_ONCE(st->res_failed); ndst.ndts_lookups += READ_ONCE(st->lookups); ndst.ndts_hits += READ_ONCE(st->hits); ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast); ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast); ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs); ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs); ndst.ndts_table_fulls += READ_ONCE(st->table_fulls); } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ write_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_INTERVAL_PROBE_TIME_MS: NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, nla_get_msecs(tbp[i])); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1])); if (tb[NDTA_THRESH2]) WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2])); if (tb[NDTA_THRESH3]) WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3])); if (tb[NDTA_GC_INTERVAL]) WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL])); err = 0; errout_tbl_lock: write_unlock_bh(&tbl->lock); errout: return err; } static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct ndtmsg *ndtm; ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); return -EINVAL; } return 0; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; if (cb->strict_check) { int err = neightbl_valid_dump_info(nlh, cb->extack); if (err < 0) return err; } family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; neigh_flags = neigh->flags & NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { read_unlock_bh(&neigh->lock); goto nla_put_failure; } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; neigh_flags = READ_ONCE(pn->flags); neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; protocol = READ_ONCE(pn->protocol); if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; } struct neigh_dump_filter { int master_idx; int dev_idx; }; static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct net *net = sock_net(skb->sk); struct neighbour *n; int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; idx = 0; neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags); if (err < 0) goto out; next: idx++; } } out: cb->args[1] = h; cb->args[2] = idx; return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb, struct neigh_dump_filter *filter) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; n; n = rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); if (err < 0) goto out; next: idx++; } } out: cb->args[3] = h; cb->args[4] = idx; return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, bool strict_check, struct neigh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; int err, i; if (strict_check) { struct ndmsg *ndm; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); return -EINVAL; } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); } if (err < 0) return err; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; /* all new attributes should require strict_check */ switch (i) { case NDA_IFINDEX: filter->dev_idx = nla_get_u32(tb[i]); break; case NDA_MASTER: filter->master_idx = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); return -EINVAL; } } } return 0; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; err = 0; s_t = cb->args[0]; rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb, &filter); else err = neigh_dump_table(tbl, skb, cb, &filter); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = t; return err; } static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; ndm = nlmsg_payload(nlh, sizeof(*ndm)); if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); return ERR_PTR(-EINVAL); } if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { NL_SET_ERR_MSG(extack, "No device specified"); return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { switch (i) { case NDA_DST: if (!tb[i]) { NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); return ERR_PTR(-EINVAL); } break; default: if (!tb[i]) continue; NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); return ERR_PTR(-EINVAL); } } return ndm; } static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); u32 pid = NETLINK_CB(in_skb).portid; struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev = NULL; u32 seq = nlh->nlmsg_seq; struct neigh_table *tbl; struct neighbour *neigh; struct sk_buff *skb; struct ndmsg *ndm; void *dst; int err; ndm = neigh_valid_get_req(nlh, tb, extack); if (IS_ERR(ndm)) return PTR_ERR(ndm); if (ndm->ndm_flags & NTF_PROXY) skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); else skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; rcu_read_lock(); tbl = neigh_find_table(ndm->ndm_family); if (!tbl) { NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); err = -EAFNOSUPPORT; goto err_unlock; } if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); err = -EINVAL; goto err_unlock; } dst = nla_data(tb[NDA_DST]); if (ndm->ndm_ifindex) { dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); err = -ENODEV; goto err_unlock; } } if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); if (err) goto err_unlock; } else { neigh = neigh_lookup(tbl, dst, dev); if (!neigh) { NL_SET_ERR_MSG(extack, "Neighbour entry not found"); err = -ENOENT; goto err_unlock; } err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); neigh_release(neigh); if (err) goto err_unlock; } rcu_read_unlock(); return rtnl_unicast(skb, net, pid); err_unlock: rcu_read_unlock(); kfree_skb(skb); return err; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock(); nht = rcu_dereference(tbl->nht); read_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled. */ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { struct neigh_hash_table *nht; int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { hlist_del_rcu(&n->hash); hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); if (!tbl) goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock(); goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_valid(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); if (!net_eq(dev_net(n->dev), net)) return NULL; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); if (!v) return NULL; if (pos) return v; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) return n; if (READ_ONCE(n->nud_state) & ~NUD_NOARP) return n; return NULL; } static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; while (++state->bucket < (1 << nht->hash_shift)) { neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { tmp = neigh_get_valid(seq, n, NULL); if (tmp) return tmp; } } return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } hlist_for_each_entry_continue(n, hash) { tmp = neigh_get_valid(seq, n, pos); if (tmp) { n = tmp; goto out; } } n = neigh_get_first(seq); out: if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = rcu_dereference(tbl->phash_buckets[bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = rcu_dereference(tbl->phash_buckets[state->bucket]); while (pn && !net_eq(pneigh_net(pn), net)) pn = rcu_dereference(pn->next); if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); read_lock_bh(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; read_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx " "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct sk_buff *skb; int err = -ENOBUFS; struct net *net; rcu_read_lock(); net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); out: rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(const struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = SYSCTL_ZERO; tmp.extra2 = SYSCTL_INT_MAX; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; int min = msecs_to_jiffies(1); tmp.extra1 = &min; tmp.extra2 = NULL; ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; size_t neigh_vars_size; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } neigh_vars_size = ARRAY_SIZE(t->neigh_vars); if (dev) { dev_name_source = dev->name; /* Terminate the table early */ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p), neigh_path, t->neigh_vars, neigh_vars_size); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; static int __init neigh_init(void) { rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } subsys_initcall(neigh_init); |
| 8 3 7 7 1 1 7 7 7 2 2 1 1 11 1 1 12 12 1 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Syncookies implementation for the Linux kernel * * Copyright (C) 1997 Andi Kleen * Based on ideas by D.J.Bernstein and Eric Schenk. */ #include <linux/tcp.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/route.h> static siphash_aligned_key_t syncookie_secret[2]; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) /* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK * stores TCP options: * * MSB LSB * | 31 ... 6 | 5 | 4 | 3 2 1 0 | * | Timestamp | ECN | SACK | WScale | * * When we receive a valid cookie-ACK, we look at the echoed tsval (if * any) to figure out which TCP options we should use for the rebuilt * connection. * * A WScale setting of '0xf' (which is an invalid scaling value) * means that original syn did not include the TCP window scaling option. */ #define TS_OPT_WSCALE_MASK 0xf #define TS_OPT_SACK BIT(4) #define TS_OPT_ECN BIT(5) /* There is no TS_OPT_TIMESTAMP: * if ACK contains timestamp option, we already know it was * requested/supported by the syn/synack exchange. */ #define TSBITS 6 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); return siphash_4u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, count, &syncookie_secret[c]); } /* * when syncookies are in effect and tcp timestamps are enabled we encode * tcp options in the lower bits of the timestamp value that will be * sent in the syn-ack. * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ u64 cookie_init_timestamp(struct request_sock *req, u64 now) { const struct inet_request_sock *ireq = inet_rsk(req); u64 ts, ts_now = tcp_ns_to_ts(false, now); u32 options = 0; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; if (ireq->sack_ok) options |= TS_OPT_SACK; if (ireq->ecn_ok) options |= TS_OPT_ECN; ts = (ts_now >> TSBITS) << TSBITS; ts |= options; if (ts > ts_now) ts -= (1UL << TSBITS); if (tcp_rsk(req)->req_usec_ts) return ts * NSEC_PER_USEC; return ts * NSEC_PER_MSEC; } static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { /* * Compute the secure sequence number. * The output should be: * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). * Where sseq is their sequence number and count increases every * minute by 1. * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } /* * This retrieves the small "data" value from the syncookie. * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * * The count value used to generate the cookie must be less than * MAX_SYNCOOKIE_AGE minutes in the past. * The return value (__u32)-1 if this test fails. */ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq) { u32 diff, count = tcp_cookie_time(); /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; /* Leaving the data behind */ } /* * MSS Values are chosen based on the 2011 paper * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. * Values .. * .. lower than 536 are rare (< 0.2%) * .. between 537 and 1299 account for less than < 1.5% of observed values * .. in the 1300-1349 range account for about 15 to 20% of observed mss values * .. exceeding 1460 are very rare (< 0.04%) * * 1460 is the single most frequently announced mss value (30 to 46% depending * on monitor location). Table must be sorted. */ static __u16 const msstab[] = { 536, 1300, 1440, /* 1440, 1452: PPPoE */ 1460, }; /* * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v4_init_sequence(iph, th, mssp); } /* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; bool own_req; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, NULL, &own_req); if (child) { refcount_set(&req->rsk_refcnt, 1); sock_rps_save_rxhash(child, skb); if (rsk_drop_req(req)) { reqsk_put(req); return child; } if (inet_csk_reqsk_queue_add(sk, req, child)) return child; bh_unlock_sock(child); sock_put(child); } __reqsk_free(req); return NULL; } EXPORT_IPV6_MOD(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * * return false if we decode a tcp option that is disabled * on the host. */ bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); return true; } if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } EXPORT_IPV6_MOD(cookie_timestamp_decode); static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); const struct tcphdr *th = tcp_hdr(skb); req->num_retrans = 0; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); ireq->ir_mark = inet_request_mark(sk, skb); if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; treq->snt_synack = 0; treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = ntohl(th->ack_seq) - 1; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; treq->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) return mptcp_subflow_init_cookie_req(req, sk, skb); #endif return 0; } #if IS_ENABLED(CONFIG_BPF) struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) { struct request_sock *req = inet_reqsk(skb->sk); skb->sk = NULL; skb->destructor = NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); req = NULL; } return req; } EXPORT_IPV6_MOD_GPL(cookie_bpf_check); #endif struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff) { struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; if (sk_is_mptcp(sk)) req = mptcp_subflow_reqsk_alloc(ops, sk, false); else req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); return NULL; } ireq = inet_rsk(req); treq = tcp_rsk(req); req->mss = mss; req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; ireq->snd_wscale = tcp_opt->snd_wscale; ireq->tstamp_ok = tcp_opt->saw_tstamp; ireq->sack_ok = tcp_opt->sack_ok; ireq->wscale_ok = tcp_opt->wscale_ok; ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); treq->ts_off = tsoff; return req; } EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; struct rtable *rt; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } tcp_ao_syncookie(sk, skb, req, AF_INET); /* * We need to lookup the route here to get at the correct * window size. We should better make sure that the window size * hasn't changed since we received the original syn, but I see * no easy way to do this. */ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; } |
| 32 18 7 24 1 32 32 18 32 62 62 62 74 31 39 36 39 2 1 1 1 1 1 24 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 | // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <net/flow.h> #include <net/ip.h> #include "ipvlan.h" static u32 ipvlan_jhash_secret __read_mostly; void ipvlan_init_secret(void) { net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret)); } void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast) { if (likely(success)) { struct ipvl_pcpu_stats *pcptr; pcptr = this_cpu_ptr(ipvlan->pcpu_stats); u64_stats_update_begin(&pcptr->syncp); u64_stats_inc(&pcptr->rx_pkts); u64_stats_add(&pcptr->rx_bytes, len); if (mcast) u64_stats_inc(&pcptr->rx_mcast); u64_stats_update_end(&pcptr->syncp); } else { this_cpu_inc(ipvlan->pcpu_stats->rx_errs); } } EXPORT_SYMBOL_GPL(ipvlan_count_rx); #if IS_ENABLED(CONFIG_IPV6) static u8 ipvlan_get_v6_hash(const void *iaddr) { const struct in6_addr *ip6_addr = iaddr; return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } #else static u8 ipvlan_get_v6_hash(const void *iaddr) { return 0; } #endif static u8 ipvlan_get_v4_hash(const void *iaddr) { const struct in_addr *ip4_addr = iaddr; return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } static bool addr_equal(bool is_v6, struct ipvl_addr *addr, const void *iaddr) { if (!is_v6 && addr->atype == IPVL_IPV4) { struct in_addr *i4addr = (struct in_addr *)iaddr; return addr->ip4addr.s_addr == i4addr->s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (is_v6 && addr->atype == IPVL_IPV6) { struct in6_addr *i6addr = (struct in6_addr *)iaddr; return ipv6_addr_equal(&addr->ip6addr, i6addr); #endif } return false; } static struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, const void *iaddr, bool is_v6) { struct ipvl_addr *addr; u8 hash; hash = is_v6 ? ipvlan_get_v6_hash(iaddr) : ipvlan_get_v4_hash(iaddr); hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) if (addr_equal(is_v6, addr, iaddr)) return addr; return NULL; } void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr) { struct ipvl_port *port = ipvlan->port; u8 hash; hash = (addr->atype == IPVL_IPV6) ? ipvlan_get_v6_hash(&addr->ip6addr) : ipvlan_get_v4_hash(&addr->ip4addr); if (hlist_unhashed(&addr->hlnode)) hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]); } void ipvlan_ht_addr_del(struct ipvl_addr *addr) { hlist_del_init_rcu(&addr->hlnode); } struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6) { struct ipvl_addr *addr, *ret = NULL; rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) { if (addr_equal(is_v6, addr, iaddr)) { ret = addr; break; } } rcu_read_unlock(); return ret; } bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6) { struct ipvl_dev *ipvlan; bool ret = false; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (ipvlan_find_addr(ipvlan, iaddr, is_v6)) { ret = true; break; } } rcu_read_unlock(); return ret; } void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) { void *lyr3h = NULL; switch (skb->protocol) { case htons(ETH_P_ARP): { struct arphdr *arph; if (unlikely(!pskb_may_pull(skb, arp_hdr_len(port->dev)))) return NULL; arph = arp_hdr(skb); *type = IPVL_ARP; lyr3h = arph; break; } case htons(ETH_P_IP): { u32 pktlen; struct iphdr *ip4h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h)))) return NULL; ip4h = ip_hdr(skb); pktlen = skb_ip_totlen(skb); if (ip4h->ihl < 5 || ip4h->version != 4) return NULL; if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) return NULL; *type = IPVL_IPV4; lyr3h = ip4h; break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct ipv6hdr *ip6h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) return NULL; ip6h = ipv6_hdr(skb); if (ip6h->version != 6) return NULL; *type = IPVL_IPV6; lyr3h = ip6h; /* Only Neighbour Solicitation pkts need different treatment */ if (ipv6_addr_any(&ip6h->saddr) && ip6h->nexthdr == NEXTHDR_ICMP) { struct icmp6hdr *icmph; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { /* Need to access the ipv6 address in body */ if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph) + sizeof(struct in6_addr)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); } *type = IPVL_ICMPV6; lyr3h = icmph; } break; } #endif default: return NULL; } return lyr3h; } unsigned int ipvlan_mac_hash(const unsigned char *addr) { u32 hash = jhash_1word(get_unaligned((u32 *)(addr + 2)), ipvlan_jhash_secret); return hash & IPVLAN_MAC_FILTER_MASK; } void ipvlan_process_multicast(struct work_struct *work) { struct ipvl_port *port = container_of(work, struct ipvl_port, wq); struct ethhdr *ethh; struct ipvl_dev *ipvlan; struct sk_buff *skb, *nskb; struct sk_buff_head list; unsigned int len; unsigned int mac_hash; int ret; u8 pkt_type; bool tx_pkt; __skb_queue_head_init(&list); spin_lock_bh(&port->backlog.lock); skb_queue_splice_tail_init(&port->backlog, &list); spin_unlock_bh(&port->backlog.lock); while ((skb = __skb_dequeue(&list)) != NULL) { struct net_device *dev = skb->dev; bool consumed = false; ethh = eth_hdr(skb); tx_pkt = IPVL_SKB_CB(skb)->tx_pkt; mac_hash = ipvlan_mac_hash(ethh->h_dest); if (ether_addr_equal(ethh->h_dest, port->dev->broadcast)) pkt_type = PACKET_BROADCAST; else pkt_type = PACKET_MULTICAST; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (tx_pkt && (ipvlan->dev == skb->dev)) continue; if (!test_bit(mac_hash, ipvlan->mac_filters)) continue; if (!(ipvlan->dev->flags & IFF_UP)) continue; ret = NET_RX_DROP; len = skb->len + ETH_HLEN; nskb = skb_clone(skb, GFP_ATOMIC); local_bh_disable(); if (nskb) { consumed = true; nskb->pkt_type = pkt_type; nskb->dev = ipvlan->dev; if (tx_pkt) ret = dev_forward_skb(ipvlan->dev, nskb); else ret = netif_rx(nskb); } ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true); local_bh_enable(); } rcu_read_unlock(); if (tx_pkt) { /* If the packet originated here, send it out. */ skb->dev = port->dev; skb->pkt_type = pkt_type; dev_queue_xmit(skb); } else { if (consumed) consume_skb(skb); else kfree_skb(skb); } dev_put(dev); cond_resched(); } } static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) { bool xnet = true; if (dev) xnet = !net_eq(dev_net(skb->dev), dev_net(dev)); skb_scrub_packet(skb, xnet); if (dev) skb->dev = dev; } static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, bool local) { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; unsigned int len; rx_handler_result_t ret = RX_HANDLER_CONSUMED; bool success = false; struct sk_buff *skb = *pskb; len = skb->len + ETH_HLEN; /* Only packets exchanged between two local slaves need to have * device-up check as well as skb-share check. */ if (local) { if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto out; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; *pskb = skb; } if (local) { skb->pkt_type = PACKET_HOST; if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { skb->dev = dev; ret = RX_HANDLER_ANOTHER; success = true; } out: ipvlan_count_rx(ipvlan, len, success, false); return ret; } struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest) { struct ipvl_addr *addr = NULL; switch (addr_type) { #if IS_ENABLED(CONFIG_IPV6) case IPVL_IPV6: { struct ipv6hdr *ip6h; struct in6_addr *i6addr; ip6h = (struct ipv6hdr *)lyr3h; i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr; addr = ipvlan_ht_addr_lookup(port, i6addr, true); break; } case IPVL_ICMPV6: { struct nd_msg *ndmh; struct in6_addr *i6addr; /* Make sure that the NeighborSolicitation ICMPv6 packets * are handled to avoid DAD issue. */ ndmh = (struct nd_msg *)lyr3h; if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { i6addr = &ndmh->target; addr = ipvlan_ht_addr_lookup(port, i6addr, true); } break; } #endif case IPVL_IPV4: { struct iphdr *ip4h; __be32 *i4addr; ip4h = (struct iphdr *)lyr3h; i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr; addr = ipvlan_ht_addr_lookup(port, i4addr, false); break; } case IPVL_ARP: { struct arphdr *arph; unsigned char *arp_ptr; __be32 dip; arph = (struct arphdr *)lyr3h; arp_ptr = (unsigned char *)(arph + 1); if (use_dest) arp_ptr += (2 * port->dev->addr_len) + 4; else arp_ptr += port->dev->addr_len; memcpy(&dip, arp_ptr, 4); addr = ipvlan_ht_addr_lookup(port, &dip, false); break; } } return addr; } static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; const struct iphdr *ip4h; struct rtable *rt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, }; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; fl4.flowi4_dscp = ip4h_dscp(ip4h); rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto err; } skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); err = ip_local_out(net, NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out; err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static noinline_for_stack int ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; struct dst_entry *dst; int err; dst = ip6_route_output(dev_net(dev), NULL, &fl6); err = dst->error; if (err) { dst_release(dst); return err; } skb_dst_set(skb, dst); return 0; } static int ipvlan_process_v6_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; int err, ret = NET_XMIT_DROP; if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) { DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return ret; } err = ipvlan_route_v6_outbound(dev, skb); if (unlikely(err)) { DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return err; } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); err = ip6_local_out(dev_net(dev), NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; return ret; } #else static int ipvlan_process_v6_outbound(struct sk_buff *skb) { return NET_XMIT_DROP; } #endif static int ipvlan_process_outbound(struct sk_buff *skb) { int ret = NET_XMIT_DROP; /* The ipvlan is a pseudo-L2 device, so the packets that we receive * will have L2; which need to discarded and processed further * in the net-ns of the main-device. */ if (skb_mac_header_was_set(skb)) { /* In this mode we dont care about * multicast and broadcast traffic */ struct ethhdr *ethh = eth_hdr(skb); if (is_multicast_ether_addr(ethh->h_dest)) { pr_debug_ratelimited( "Dropped {multi|broad}cast of type=[%x]\n", ntohs(skb->protocol)); kfree_skb(skb); goto out; } skb_pull(skb, sizeof(*ethh)); skb->mac_header = (typeof(skb->mac_header))~0U; skb_reset_network_header(skb); } if (skb->protocol == htons(ETH_P_IPV6)) ret = ipvlan_process_v6_outbound(skb); else if (skb->protocol == htons(ETH_P_IP)) ret = ipvlan_process_v4_outbound(skb); else { pr_warn_ratelimited("Dropped outbound packet type=%x\n", ntohs(skb->protocol)); kfree_skb(skb); } out: return ret; } static void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt) { if (skb->protocol == htons(ETH_P_PAUSE)) { kfree_skb(skb); return; } /* Record that the deferred packet is from TX or RX path. By * looking at mac-addresses on packet will lead to erronus decisions. * (This would be true for a loopback-mode on master device or a * hair-pin mode of the switch.) */ IPVL_SKB_CB(skb)->tx_pkt = tx_pkt; spin_lock(&port->backlog.lock); if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) { dev_hold(skb->dev); __skb_queue_tail(&port->backlog, skb); spin_unlock(&port->backlog.lock); schedule_work(&port->wq); } else { spin_unlock(&port->backlog.lock); dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); } } static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); void *lyr3h; struct ipvl_addr *addr; int addr_type; lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (!lyr3h) goto out; if (!ipvlan_is_vepa(ipvlan->port)) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); return ipvlan_process_outbound(skb); } static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); struct ethhdr *eth = skb_eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (!ipvlan_is_vepa(ipvlan->port) && ether_addr_equal(eth->h_dest, eth->h_source)) { lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Packet definitely does not belong to any of the * virtual devices, but the dest is local. So forward * the skb for the main-dev. At the RX side we just return * RX_PASS for it to be processed further on the stack. */ dev_forward_skb(ipvlan->phy_dev, skb); return NET_XMIT_SUCCESS; } else if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); ipvlan_skb_crossing_ns(skb, NULL); ipvlan_multicast_enqueue(ipvlan->port, skb, true); return NET_XMIT_SUCCESS; } skb->dev = ipvlan->phy_dev; return dev_queue_xmit(skb); } int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev); if (!port) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) goto out; switch(port->mode) { case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: #endif return ipvlan_xmit_mode_l3(skb, dev); } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); out: kfree_skb(skb); return NET_XMIT_DROP; } static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) { struct ethhdr *eth = eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) return true; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); if (addr) return false; } return true; } static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, struct ipvl_port *port) { void *lyr3h; int addr_type; struct ipvl_addr *addr; struct sk_buff *skb = *pskb; rx_handler_result_t ret = RX_HANDLER_PASS; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); if (addr) ret = ipvlan_rcv_frame(addr, pskb, false); out: return ret; } static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ipvl_port *port) { struct sk_buff *skb = *pskb; struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; if (is_multicast_ether_addr(eth->h_dest)) { if (ipvlan_external_frame(skb, port)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later * when work-queue processes this frame. This is * achieved by returning RX_HANDLER_PASS. */ if (nskb) { ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); } } } else { /* Perform like l3 mode for non-multicast packet */ ret = ipvlan_handle_mode_l3(pskb, port); } return ret; } rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); if (!port) return RX_HANDLER_PASS; switch (port->mode) { case IPVLAN_MODE_L2: return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: return RX_HANDLER_PASS; #endif } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); kfree_skb(skb); return RX_HANDLER_CONSUMED; } |
| 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | // SPDX-License-Identifier: GPL-2.0 /* * Hibernation support for x86 * * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> #include <linux/scatterlist.h> #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/pgtable.h> #include <linux/types.h> #include <linux/crc32.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/proto.h> #include <asm/page.h> #include <asm/mtrr.h> #include <asm/sections.h> #include <asm/suspend.h> #include <asm/tlbflush.h> /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ unsigned long restore_jump_address __visible; unsigned long jump_address_phys; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ unsigned long restore_cr3 __visible; unsigned long temp_pgt __visible; unsigned long relocated_restore_code __visible; /** * pfn_is_nosave - check if given pfn is in the 'nosave' section * @pfn: the page frame number to check. */ int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn; unsigned long nosave_end_pfn; nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; } struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; unsigned long e820_checksum; }; /** * compute_e820_crc32 - calculate crc32 of a given e820 table * * @table: the e820 table to be calculated * * Return: the resulting checksum */ static inline u32 compute_e820_crc32(struct e820_table *table) { int size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; return ~crc32_le(~0, (unsigned char const *)table, size); } #ifdef CONFIG_X86_64 #define RESTORE_MAGIC 0x23456789ABCDEF02UL #else #define RESTORE_MAGIC 0x12345679UL #endif /** * arch_hibernation_header_save - populate the architecture specific part * of a hibernation image header * @addr: address where architecture specific header data will be saved. * @max_size: maximum size of architecture specific data in hibernation header. * * Return: 0 on success, -EOVERFLOW if max_size is insufficient. */ int arch_hibernation_header_save(void *addr, unsigned int max_size) { struct restore_data_record *rdr = addr; if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; rdr->magic = RESTORE_MAGIC; rdr->jump_address = (unsigned long)restore_registers; rdr->jump_address_phys = __pa_symbol(restore_registers); /* * The restore code fixes up CR3 and CR4 in the following sequence: * * [in hibernation asm] * 1. CR3 <= temporary page tables * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) * 3. CR3 <= rdr->cr3 * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) * [in restore_processor_state()] * 5. CR4 <= saved CR4 * 6. CR3 <= saved CR3 * * Our mmu_cr4_features has CR4.PCIDE=0, and toggling * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so * rdr->cr3 needs to point to valid page tables but must not * have any of the PCID bits set. */ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; rdr->e820_checksum = compute_e820_crc32(e820_table_firmware); return 0; } /** * arch_hibernation_header_restore - read the architecture specific data * from the hibernation image header * @addr: address to read the data from */ int arch_hibernation_header_restore(void *addr) { struct restore_data_record *rdr = addr; if (rdr->magic != RESTORE_MAGIC) { pr_crit("Unrecognized hibernate image header format!\n"); return -EINVAL; } restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) { pr_crit("Hibernate inconsistent memory map detected!\n"); return -ENODEV; } return 0; } int relocate_restore_code(void) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) return -ENOMEM; __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code); if (p4d_leaf(*p4d)) { set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); goto out; } pud = pud_offset(p4d, relocated_restore_code); if (pud_leaf(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); goto out; } pmd = pmd_offset(pud, relocated_restore_code); if (pmd_leaf(*pmd)) { set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); goto out; } pte = pte_offset_kernel(pmd, relocated_restore_code); set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); out: __flush_tlb_all(); return 0; } int arch_resume_nosmt(void) { int ret; /* * We reached this while coming out of hibernation. This means * that SMT siblings are sleeping in hlt, as mwait is not safe * against control transition during resume (see comment in * hibernate_resume_nonboot_cpu_disable()). * * If the resumed kernel has SMT disabled, we have to take all the * SMT siblings out of hlt, and offline them again so that they * end up in mwait proper. * * Called with hotplug disabled. */ cpu_hotplug_enable(); ret = arch_cpu_rescan_dead_smt_siblings(); cpu_hotplug_disable(); return ret; } |
| 154 154 17 7 17 2852 2770 2766 1490 1491 1488 5 233 62 448 449 1 1 6 2 44 43 18 610 384 448 390 76 19 616 7 611 410 2 404 405 616 615 200 617 51 40 42 616 616 614 1 3 1 1 1 1 1 1 1 42 13 28 23 3 20 4 15 1 6 9 12 14 7 21 13 11 5 25 25 24 23 20 1 19 1 19 18 7 16 34 2 33 35 27 21 17 32 32 32 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 | // SPDX-License-Identifier: GPL-2.0-only /* * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules * * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project * * Authors * Thomas Graf <tgraf@suug.ch> * Ville Nuorvala <vnuorval@tcs.hut.fi> */ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/export.h> #include <linux/indirect_call_wrapper.h> #include <net/fib_rules.h> #include <net/inet_dscp.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/netlink.h> struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; __be32 flowlabel; __be32 flowlabel_mask; dscp_t dscp; dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ }; static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } bool fib6_rule_default(const struct fib_rule *rule) { if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN) return false; return true; } EXPORT_SYMBOL_GPL(fib6_rule_default); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { int err; if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = fib6_table_lookup, .lookup_data = &oif, .result = res, .flags = FIB_LOOKUP_NOREF, }; l3mdev_update_flow(net, flowi6_to_flowi(fl6)); err = fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); } else { err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif, fl6, res, flags); if (err || res->f6i == net->ipv6.fib6_null_entry) err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } return err; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib6_result res = {}; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .lookup_data = skb, .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(fl6)); fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); if (res.rt6) return &res.rt6->dst; } else { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); } if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, struct flowi6 *flp6, const struct net_device *dev) { struct fib6_rule *r = (struct fib6_rule *)rule; /* If we need to find a source address for this traffic, * we check the result if it meets requirement of the rule. */ if ((rule->flags & FIB_RULE_FIND_SADDR) && r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { struct in6_addr saddr; if (ipv6_dev_get_saddr(net, dev, &flp6->daddr, rt6_flags2srcprefs(flags), &saddr)) return -EAGAIN; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) return -EAGAIN; flp6->saddr = saddr; } return 0; } static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct net *net = rule->fr_net; struct fib6_table *table; int err, *oif; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } tb_id = fib_rule_get_table(rule, arg); table = fib6_get_table(net, tb_id); if (!table) return -EAGAIN; oif = (int *)arg->lookup_data; err = fib6_table_lookup(net, table, *oif, flp6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) err = fib6_rule_saddr(net, rule, flags, flp6, res->nh->fib_nh_dev); else err = -EAGAIN; return err; } static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; int err = 0; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: err = -ENETUNREACH; rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: err = -EINVAL; rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: err = -EACCES; rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } tb_id = fib_rule_get_table(rule, arg); table = fib6_get_table(net, tb_id); if (!table) { err = -EAGAIN; goto out; } rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { struct inet6_dev *idev = ip6_dst_idev(&rt->dst); if (!idev) goto again; err = fib6_rule_saddr(net, rule, flags, flp6, idev->dev); if (err == -EAGAIN) goto again; err = rt->dst.error; if (err != -EAGAIN) goto out; } again: ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); out: res->rt6 = rt; return err; } INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { if (arg->lookup_ptr == fib6_table_lookup) return fib6_rule_action_alt(rule, flp, flags, arg); return __fib6_rule_action(rule, flp, flags, arg); } INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct rt6_info *rt = res->rt6; struct net_device *dev = NULL; if (!rt) return false; if (rt->rt6i_idev) dev = rt->rt6i_idev->dev; /* do not accept result if the route does * not meet the required prefix length */ if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: ip6_rt_put_flags(rt, flags); return true; } INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; struct flowi6 *fl6 = &fl->u.ip6; if (r->dst.plen && !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) return 0; /* * If FIB_RULE_FIND_SADDR is set and we do not have a * source address for the traffic, we defer check for * source address. */ if (r->src.plen) { if (flags & RT6_LOOKUP_F_HAS_SADDR) { if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, r->src.plen)) return 0; } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask) return 0; if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) return 0; if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, fl6->fl6_sport)) return 0; if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, fl6->fl6_dport)) return 0; return 1; } static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { if (rule6->dscp) { NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); return -EINVAL; } rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule6->dscp_full = true; return 0; } static int fib6_nl2rule_dscp_mask(const struct nlattr *nla, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { dscp_t dscp_mask; if (!rule6->dscp_full) { NL_SET_ERR_MSG_ATTR(extack, nla, "Cannot specify DSCP mask without DSCP value"); return -EINVAL; } dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); if (rule6->dscp & ~dscp_mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); return -EINVAL; } rule6->dscp_mask = dscp_mask; return 0; } static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { __be32 flowlabel, flowlabel_mask; if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) return -EINVAL; flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], "Invalid flow label mask"); return -EINVAL; } if (flowlabel & ~flowlabel_mask) { NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); return -EINVAL; } rule6->flowlabel = flowlabel; rule6->flowlabel_mask = flowlabel_mask; return 0; } static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct fib6_rule *rule6 = (struct fib6_rule *)rule; struct net *net = rule->fr_net; int err = -EINVAL; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0; if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) goto errout; if (tb[FRA_DSCP_MASK] && fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0) goto errout; if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) goto errout; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); goto errout; } if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; goto errout; } } if (frh->src_len) rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]); if (frh->dst_len) rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]); rule6->src.plen = frh->src_len; rule6->dst.plen = frh->dst_len; if (fib_rule_requires_fldissect(rule)) net->ipv6.fib6_rules_require_fldissect++; net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; } static int fib6_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; if (net->ipv6.fib6_rules_require_fldissect && fib_rule_requires_fldissect(rule)) net->ipv6.fib6_rules_require_fldissect--; return 0; } static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (frh->src_len && (rule6->src.plen != frh->src_len)) return 0; if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; if (frh->tos && (rule6->dscp_full || inet_dscp_to_dsfield(rule6->dscp) != frh->tos)) return 0; if (tb[FRA_DSCP]) { dscp_t dscp; dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); if (!rule6->dscp_full || rule6->dscp != dscp) return 0; } if (tb[FRA_DSCP_MASK]) { dscp_t dscp_mask; dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask) return 0; } if (tb[FRA_FLOWLABEL] && nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) return 0; if (tb[FRA_FLOWLABEL_MASK] && nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) return 0; if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; if (frh->dst_len && nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr))) return 0; return 1; } static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; if (rule6->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, inet_dscp_to_dsfield(rule6->dscp) >> 2) || nla_put_u8(skb, FRA_DSCP_MASK, inet_dscp_to_dsfield(rule6->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule6->dscp); } if (rule6->flowlabel_mask && (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) goto nla_put_failure; if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr))) goto nla_put_failure; return 0; nla_put_failure: return -ENOBUFS; } static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ + nla_total_size(16) /* src */ + nla_total_size(1) /* dscp */ + nla_total_size(1) /* dscp mask */ + nla_total_size(4) /* flowlabel */ + nla_total_size(4); /* flowlabel mask */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) { rt_genid_bump_ipv6(ops->fro_net); } static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .delete = fib6_rule_delete, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, .flush_cache = fib6_rule_flush_cache, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, .fro_net = &init_net, }; static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; int err; ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL); if (err) goto out_fib6_rules_ops; err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN); if (err) goto out_fib6_rules_ops; net->ipv6.fib6_rules_ops = ops; net->ipv6.fib6_rules_require_fldissect = 0; out: return err; out_fib6_rules_ops: fib_rules_unregister(ops); goto out; } static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { fib_rules_unregister(net->ipv6.fib6_rules_ops); cond_resched(); } rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit_batch = fib6_rules_net_exit_batch, }; int __init fib6_rules_init(void) { return register_pernet_subsys(&fib6_rules_net_ops); } void fib6_rules_cleanup(void) { unregister_pernet_subsys(&fib6_rules_net_ops); } |
| 8 8 9 9 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 | // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> #include <linux/completion.h> #include <asm/msr.h> static void __rdmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; if (rv->msrs) reg = this_cpu_ptr(rv->msrs); else reg = &rv->reg; rdmsr(rv->msr_no, reg->l, reg->h); } static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; if (rv->msrs) reg = this_cpu_ptr(rv->msrs); else reg = &rv->reg; wrmsr(rv->msr_no, reg->l, reg->h); } int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *l = rv.reg.l; *h = rv.reg.h; return err; } EXPORT_SYMBOL(rdmsr_on_cpu); int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *q = rv.reg.q; return err; } EXPORT_SYMBOL(rdmsrq_on_cpu); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsr_on_cpu); int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsrq_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs, void (*msr_func) (void *info)) { struct msr_info rv; int this_cpu; memset(&rv, 0, sizeof(rv)); rv.msrs = msrs; rv.msr_no = msr_no; this_cpu = get_cpu(); if (cpumask_test_cpu(this_cpu, mask)) msr_func(&rv); smp_call_function_many(mask, msr_func, &rv, 1); put_cpu(); } /* rdmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); } EXPORT_SYMBOL(rdmsr_on_cpus); /* * wrmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); } EXPORT_SYMBOL(wrmsr_on_cpus); struct msr_info_completion { struct msr_info msr; struct completion done; }; /* These "safe" variants are slower and should be used when the target MSR may not actually exist. */ static void __rdmsr_safe_on_cpu(void *info) { struct msr_info_completion *rv = info; rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h); complete(&rv->done); } static void __wrmsr_safe_on_cpu(void *info) { struct msr_info *rv = info; rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); } int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; call_single_data_t csd; int err; INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; err = smp_call_function_single_async(cpu, &csd); if (!err) { wait_for_completion(&rv.done); err = rv.msr.err; } *l = rv.msr.reg.l; *h = rv.msr.reg.h; return err; } EXPORT_SYMBOL(rdmsr_safe_on_cpu); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_on_cpu); int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsrq_safe_on_cpu); int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { u32 low, high; int err; err = rdmsr_safe_on_cpu(cpu, msr_no, &low, &high); *q = (u64)high << 32 | low; return err; } EXPORT_SYMBOL(rdmsrq_safe_on_cpu); /* * These variants are significantly slower, but allows control over * the entire 32-bit GPR set. */ static void __rdmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = rdmsr_safe_regs(rv->regs); } static void __wrmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = wrmsr_safe_regs(rv->regs); } int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); |
| 35 34 3 35 32 2 2 33 7 6 1 1 34 21 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Misc and compatibility things * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/fs.h> #include <sound/core.h> void release_and_free_resource(struct resource *res) { if (res) { release_resource(res); kfree(res); } } EXPORT_SYMBOL(release_and_free_resource); #ifdef CONFIG_PCI #include <linux/pci.h> /** * snd_pci_quirk_lookup_id - look up a PCI SSID quirk list * @vendor: PCI SSV id * @device: PCI SSD id * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list) { const struct snd_pci_quirk *q; for (q = list; q->subvendor || q->subdevice; q++) { if (q->subvendor != vendor) continue; if (!q->subdevice || (device & q->subdevice_mask) == q->subdevice) return q; } return NULL; } EXPORT_SYMBOL(snd_pci_quirk_lookup_id); /** * snd_pci_quirk_lookup - look up a PCI SSID quirk list * @pci: pci_dev handle * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list) { if (!pci) return NULL; return snd_pci_quirk_lookup_id(pci->subsystem_vendor, pci->subsystem_device, list); } EXPORT_SYMBOL(snd_pci_quirk_lookup); #endif /* * Deferred async signal helpers * * Below are a few helper functions to wrap the async signal handling * in the deferred work. The main purpose is to avoid the messy deadlock * around tasklist_lock and co at the kill_fasync() invocation. * fasync_helper() and kill_fasync() are replaced with snd_fasync_helper() * and snd_kill_fasync(), respectively. In addition, snd_fasync_free() has * to be called at releasing the relevant file object. */ struct snd_fasync { struct fasync_struct *fasync; int signal; int poll; int on; struct list_head list; }; static DEFINE_SPINLOCK(snd_fasync_lock); static LIST_HEAD(snd_fasync_list); static void snd_fasync_work_fn(struct work_struct *work) { struct snd_fasync *fasync; spin_lock_irq(&snd_fasync_lock); while (!list_empty(&snd_fasync_list)) { fasync = list_first_entry(&snd_fasync_list, struct snd_fasync, list); list_del_init(&fasync->list); spin_unlock_irq(&snd_fasync_lock); if (fasync->on) kill_fasync(&fasync->fasync, fasync->signal, fasync->poll); spin_lock_irq(&snd_fasync_lock); } spin_unlock_irq(&snd_fasync_lock); } static DECLARE_WORK(snd_fasync_work, snd_fasync_work_fn); int snd_fasync_helper(int fd, struct file *file, int on, struct snd_fasync **fasyncp) { struct snd_fasync *fasync = NULL; if (on) { fasync = kzalloc(sizeof(*fasync), GFP_KERNEL); if (!fasync) return -ENOMEM; INIT_LIST_HEAD(&fasync->list); } scoped_guard(spinlock_irq, &snd_fasync_lock) { if (*fasyncp) { kfree(fasync); fasync = *fasyncp; } else { if (!fasync) return 0; *fasyncp = fasync; } fasync->on = on; } return fasync_helper(fd, file, on, &fasync->fasync); } EXPORT_SYMBOL_GPL(snd_fasync_helper); void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll) { if (!fasync || !fasync->on) return; guard(spinlock_irqsave)(&snd_fasync_lock); fasync->signal = signal; fasync->poll = poll; list_move(&fasync->list, &snd_fasync_list); schedule_work(&snd_fasync_work); } EXPORT_SYMBOL_GPL(snd_kill_fasync); void snd_fasync_free(struct snd_fasync *fasync) { if (!fasync) return; fasync->on = 0; flush_work(&snd_fasync_work); kfree(fasync); } EXPORT_SYMBOL_GPL(snd_fasync_free); |
| 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Sysctl interface to net af_unix subsystem. * * Authors: Mike Shaver. */ #include <linux/slab.h> #include <linux/string.h> #include <linux/sysctl.h> #include <net/af_unix.h> #include <net/net_namespace.h> #include "af_unix.h" static struct ctl_table unix_table[] = { { .procname = "max_dgram_qlen", .data = &init_net.unx.sysctl_max_dgram_qlen, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, }; int __net_init unix_sysctl_register(struct net *net) { struct ctl_table *table; if (net_eq(net, &init_net)) { table = unix_table; } else { table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->unx.sysctl_max_dgram_qlen; } net->unx.ctl = register_net_sysctl_sz(net, "net/unix", table, ARRAY_SIZE(unix_table)); if (net->unx.ctl == NULL) goto err_reg; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } void unix_sysctl_unregister(struct net *net) { const struct ctl_table *table; table = net->unx.ctl->ctl_table_arg; unregister_net_sysctl_table(net->unx.ctl); if (!net_eq(net, &init_net)) kfree(table); } |
| 36 83 134 138 117 697 52 268 52 11040 817 381 9585 864 41 1804 6 3 6 1 2080 62 2120 4 69 631 3373 11 13 3371 13 3378 2527 19 499 79 67 88 25 21 7 1426 1485 1704 883 5 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H #include <linux/rculist.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> #include <linux/pid.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ struct core_state *core_state; /* coredumping support */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int timer_create_restore_ids:1; atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. */ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif #ifdef CONFIG_CGROUPS struct rw_semaphore cgroup_threadgroup_rwsem; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. */ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) { current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); } spin_unlock_irq(¤t->sighand->siglock); schedule(); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr); int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t); int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); } /* * Returns 'true' if kick_process() is needed to force a transition from * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. */ static inline bool __set_notify_signal(struct task_struct *task) { return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE); } /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. */ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(¤t->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = ¤t->blocked; if (unlikely(test_restore_sigmask())) res = ¤t->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define for_other_threads(p, t) \ for (t = p; (t = next_thread(t)) != p; ) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } /* * returns NULL if p is the last thread in the thread group */ static inline struct task_struct *__next_thread(struct task_struct *p) { return list_next_or_null_rcu(&p->signal->thread_head, &p->thread_node, struct task_struct, thread_node); } static inline struct task_struct *next_thread(struct task_struct *p) { return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) { return thread_group_leader(p) && list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; ret = __lock_task_sighand(task, flags); (void)__cond_lock(&task->sighand->siglock, ret); return ret; } static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } #ifdef CONFIG_LOCKDEP extern void lockdep_assert_task_sighand_held(struct task_struct *task); #else static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } #endif static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */ |
| 2 11 2 33 7 64 106 61 6 7 6 1 5 77 3 11 4 11 4 4 19 2 16 98 18 2 185 2 188 187 188 188 188 5 5 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GRO_H #define _NET_GRO_H #include <linux/indirect_call_wrapper.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/skbuff.h> #include <net/udp.h> #include <net/hotdata.h> /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) struct napi_gro_cb { union { struct { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ void *frag0; /* Length of frag0. */ unsigned int frag0_len; }; struct { /* used in skb_gro_receive() slow path */ struct sk_buff *last; /* jiffies when first packet was created/queued */ unsigned long age; }; }; /* This indicates where we are processing relative to skb->data. */ int data_offset; /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush; /* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto; u16 pad; /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 /* portion of the cb set to zero at every gro iteration */ struct_group(zeroed, /* Start offset for remote checksum offload */ u16 gro_remcsum_start; /* This is non-zero if the packet may be of the same flow. */ u8 same_flow:1; /* Used in tunnel GRO receive */ u8 encap_mark:1; /* GRO checksum is valid */ u8 csum_valid:1; /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; /* Free the skb? */ u8 free:2; /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ u8 ip_fixedid:2; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; /* GRO is done by frag_list pointer chaining. */ u8 is_flist:1; ); /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; /* L3 offsets */ union { struct { u16 network_offset; u16 inner_network_offset; }; u16 network_offsets[2]; }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) #define GRO_RECURSION_LIMIT 15 static inline int gro_recursion_inc_test(struct sk_buff *skb) { return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive(gro_receive_t cb, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(head, skb); } typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, struct sock *sk, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(sk, head, skb); } static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->data_offset; } static inline unsigned int skb_gro_len(const struct sk_buff *skb) { return skb->len - NAPI_GRO_CB(skb)->data_offset; } static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) { NAPI_GRO_CB(skb)->data_offset += len; } static inline void *skb_gro_header_fast(const struct sk_buff *skb, unsigned int offset) { return NAPI_GRO_CB(skb)->frag0 + offset; } static inline bool skb_gro_may_pull(const struct sk_buff *skb, unsigned int hlen) { return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); } static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; return skb->data + offset; } static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { void *ptr; ptr = skb_gro_header_fast(skb, offset); if (!skb_gro_may_pull(skb, hlen)) ptr = skb_gro_header_slow(skb, hlen, offset); return ptr; } static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; } static inline void *skb_gro_network_header(const struct sk_buff *skb) { if (skb_gro_may_pull(skb, skb_gro_offset(skb))) return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); return skb->data + skb_gro_receive_network_offset(skb); } static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct iphdr *iph = skb_gro_network_header(skb); return csum_tcpudp_nofold(iph->saddr, iph->daddr, skb_gro_len(skb), proto, 0); } static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len, wsum_negate(NAPI_GRO_CB(skb)->csum))); } /* GRO checksum functions. These are logical equivalents of the normal * checksum functions (in skbuff.h) except that they operate on the GRO * offsets and fields in sk_buff. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); } static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { return ((skb->ip_summed != CHECKSUM_PARTIAL || skb_checksum_start_offset(skb) < skb_gro_offset(skb)) && !skb_at_gro_remcsum_start(skb) && NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, __wsum psum) { if (NAPI_GRO_CB(skb)->csum_valid && !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) return 0; NAPI_GRO_CB(skb)->csum = psum; return __skb_gro_checksum_complete(skb); } static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->csum_cnt > 0) { /* Consume a checksum from CHECKSUM_UNNECESSARY */ NAPI_GRO_CB(skb)->csum_cnt--; } else { /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we * verified a new top level checksum or an encapsulated one * during GRO. This saves work if we fallback to normal path. */ __skb_incr_checksum_unnecessary(skb); } } #define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ compute_pseudo) \ ({ \ __sum16 __ret = 0; \ if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ if (!__ret) \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) #define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) #define skb_gro_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) #define skb_gro_checksum_simple_validate(skb) \ __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid); } static inline void __skb_gro_checksum_convert(struct sk_buff *skb, __wsum pseudo) { NAPI_GRO_CB(skb)->csum = ~pseudo; NAPI_GRO_CB(skb)->csum_valid = 1; } #define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_gro_checksum_convert_check(skb)) \ __skb_gro_checksum_convert(skb, \ compute_pseudo(skb, proto)); \ } while (0) struct gro_remcsum { int offset; __wsum delta; }; static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) { grc->offset = 0; grc->delta = 0; } static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, unsigned int off, size_t hdrlen, int start, int offset, struct gro_remcsum *grc, bool nopartial) { __wsum delta; size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); if (!nopartial) { NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; return ptr; } ptr = skb_gro_header(skb, off + plen, off); if (!ptr) return NULL; delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); grc->offset = off + hdrlen + offset; grc->delta = delta; return ptr; } static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, struct gro_remcsum *grc) { void *ptr; size_t plen = grc->offset + sizeof(u16); if (!grc->delta) return; ptr = skb_gro_header(skb, plen, grc->offset); if (!ptr) return; remcsum_unadjust((__sum16 *)ptr, grc->delta); } #ifdef CONFIG_XFRM_OFFLOAD static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { if (PTR_ERR(pp) != -EINPROGRESS) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } } #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } #endif INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); #define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_INET(cb, f2, f1, head, skb); \ }) struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header(skb, hlen, off); return uh; } static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, skb_gro_len(skb), proto, 0)); } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, struct sk_buff *p, bool inner) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; /* All fields must match except length and checksum. */ if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF)) return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ if (count == 1 && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) { /* <Version:4><Traffic_Class:8><Flow_Label:20> */ __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* Flush if Traffic Class fields are different. */ return !!((first_word & htonl(0x0FF00000)) | (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); } static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, bool inner) { const void *nh = th - diff; const void *nh2 = th2 - diff; if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else return inet_gro_flush(nh, nh2, p, inner); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { int off = skb_transport_offset(p); int flush; flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false); if (NAPI_GRO_CB(p)->encap_mark) flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true); return flush; } int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); void __gro_flush(struct gro_node *gro, bool flush_old); static inline void gro_flush(struct gro_node *gro, bool flush_old) { if (!gro->bitmask) return; __gro_flush(gro, flush_old); } static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) { gro_flush(&napi->gro, flush_old); } /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ static inline void gro_normal_list(struct gro_node *gro) { if (!gro->rx_count) return; netif_receive_skb_list_internal(&gro->rx_list); INIT_LIST_HEAD(&gro->rx_list); gro->rx_count = 0; } static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) { gro_flush(gro, flush_old); gro_normal_list(gro); } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &gro->rx_list); gro->rx_count += segs; if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(gro); } void gro_init(struct gro_node *gro); void gro_cleanup(struct gro_node *gro); /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { *iif = inet_iif(skb) ?: skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } /* This function is the alternative of 'inet6_iif' and 'inet6_sdif' * functions in case we can not rely on fields of IP6CB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ *iif = skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); #endif /* _NET_GRO_H */ |
| 584 585 584 585 585 585 464 585 584 585 585 252 252 252 330 8014 7321 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu-vm.c - vmalloc area based chunk allocation * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ #include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { /* must not be used on pre-mapped chunk */ WARN_ON(chunk->immutable); return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /** * pcpu_get_pages - get temp pages array * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } /** * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * * Free pages [@page_start and @page_end) in @pages for all units. * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page = pages[pcpu_page_idx(cpu, i)]; if (page) __free_page(page); } } } /** * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end, gfp_t gfp) { unsigned int cpu, tcpu; int i; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); if (!*pagep) goto err; } } return 0; err: while (--i >= page_start) __free_page(pages[pcpu_page_idx(cpu, i)]); for_each_possible_cpu(tcpu) { if (tcpu == cpu) break; for (i = page_start; i < page_end; i++) __free_page(pages[pcpu_page_idx(tcpu, i)]); } return -ENOMEM; } /** * pcpu_pre_unmap_flush - flush cache prior to unmapping * @chunk: chunk the regions to be flushed belongs to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages in [@page_start,@page_end) of @chunk are about to be * unmapped. Flush cache. As each flushing trial can be very * expensive, issue flush on the whole region at once rather than * doing it for each cpu. This could be an overkill but is more * scalable. */ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * Corresponding elements in @pages were cleared by the caller and can * be used to carry information to pcpu_free_pages() which will be * called after all unmaps are finished. The caller should call * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page; page = pcpu_chunk_page(chunk, cpu, i); WARN_ON(!page); pages[pcpu_page_idx(cpu, i)] = page; } __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } } /** * pcpu_post_unmap_tlb_flush - flush TLB after unmapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush * TLB for the regions. This can be skipped if the area is to be * returned to vmalloc as vmalloc will handle TLB flushing lazily. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), PAGE_KERNEL, pages, PAGE_SHIFT); } /** * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * * For each cpu, map pages [@page_start,@page_end) into @chunk. The * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * * This function is responsible for setting up whatever is necessary for * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) goto err; for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); } return 0; err: for_each_possible_cpu(tcpu) { __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); if (tcpu == cpu) break; } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } /** * pcpu_post_map_flush - flush cache after mapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been mapped. Flush * cache. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp) { struct page **pages; pages = pcpu_get_pages(); if (!pages) return -ENOMEM; if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { pcpu_free_pages(chunk, pages, page_start, page_end); return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @page_start: the start page * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. * * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the * region back to vmalloc() which will lazily flush the tlb. * * CONTEXT: * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end) { struct page **pages; /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_unmap_pages(chunk, pages, page_start, page_end); pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; } chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; pcpu_stats_chunk_dealloc(); trace_percpu_destroy_chunk(chunk->base_addr); if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } static struct page *pcpu_addr_to_page(void *addr) { return vmalloc_to_page(addr); } static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) { /* no extra restriction */ return 0; } /** * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim * @chunk: chunk of interest * * This is the entry point for percpu reclaim. If a chunk qualifies, it is then * isolated and managed in separate lists at the back of pcpu_slot: sidelined * and to_depopulate respectively. The to_depopulate list holds chunks slated * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once * they are on this list. Once depopulated, they are moved onto the sidelined * list which enables them to be pulled back in for allocation if no other chunk * can suffice the allocation. */ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) { /* do not reclaim either the first chunk or reserved chunk */ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) return false; /* * If it is isolated, it may be on the sidelined list so move it back to * the to_depopulate list. If we hit at least 1/4 pages empty pages AND * there is no system-wide shortage of empty pages aside from this * chunk, move it to the to_depopulate list. */ return ((chunk->isolated && chunk->nr_empty_pop_pages) || (pcpu_nr_empty_pop_pages > (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); } |
| 47 47 72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Antonio Quartulli */ #include "bat_v_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include "hard-interface.h" #include "hash.h" #include "log.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" /** * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node * @bat_priv: the bat priv with all the mesh interface information * @addr: the address of the originator * * Return: the orig_node corresponding to the specified address. If such an * object does not exist, it is allocated here. In case of allocation failure * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) { /* remove refcnt for newly created orig_node and hash entry */ batadv_orig_node_put(orig_node); batadv_orig_node_put(orig_node); orig_node = NULL; } return orig_node; } /** * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) { unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ msecs += get_random_u32_below(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } /** * batadv_v_ogm_start_timer() - restart the OGM sending timer * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) { unsigned long msecs; /* this function may be invoked in different contexts (ogm rescheduling * or hard_iface activation), but the work timer should not be reset */ if (delayed_work_pending(&bat_priv->bat_v.ogm_wq)) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } /** * batadv_v_ogm_send_to_if() - send a batman ogm using a given interface * @skb: the OGM to send * @hard_iface: the interface to use to send the OGM */ static void batadv_v_ogm_send_to_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); return; } batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } /** * batadv_v_ogm_len() - OGMv2 packet length * @skb: the OGM to check * * Return: Length of the given OGMv2 packet, including tvlv length, excluding * ethernet header length. */ static unsigned int batadv_v_ogm_len(struct sk_buff *skb) { struct batadv_ogm2_packet *ogm_packet; ogm_packet = (struct batadv_ogm2_packet *)skb->data; return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len); } /** * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue * @skb: the OGM to check * @hard_iface: the interface to use to send the OGM * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. * * Return: True, if the given OGMv2 packet still fits, false otherwise. */ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu, BATADV_MAX_AGGREGATION_BYTES); unsigned int ogm_len = batadv_v_ogm_len(skb); lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); return hard_iface->bat_v.aggr_len + ogm_len <= max; } /** * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface) { lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); __skb_queue_purge(&hard_iface->bat_v.aggr_list); hard_iface->bat_v.aggr_len = 0; } /** * batadv_v_ogm_aggr_send() - flush & send aggregation queue * @hard_iface: the interface with the aggregation queue to flush * * Aggregates all OGMv2 packets currently in the aggregation queue into a * single OGMv2 packet and transmits this aggregate. * * The aggregation queue is empty after this call. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) { unsigned int aggr_len = hard_iface->bat_v.aggr_len; struct sk_buff *skb_aggr; unsigned int ogm_len; struct sk_buff *skb; lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock); if (!aggr_len) return; skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN); if (!skb_aggr) { batadv_v_ogm_aggr_list_free(hard_iface); return; } skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN); skb_reset_network_header(skb_aggr); while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) { hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb); ogm_len = batadv_v_ogm_len(skb); skb_put_data(skb_aggr, skb->data, ogm_len); consume_skb(skb); } batadv_v_ogm_send_to_if(skb_aggr, hard_iface); } /** * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface * @skb: the OGM to queue * @hard_iface: the interface to queue the OGM on */ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (!atomic_read(&bat_priv->aggregated_ogms)) { batadv_v_ogm_send_to_if(skb, hard_iface); return; } spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); if (!batadv_v_ogm_queue_left(skb, hard_iface)) batadv_v_ogm_aggr_send(hard_iface); hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb); __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_send_meshif() - periodic worker broadcasting the own OGM * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; struct list_head *iter; int ogm_buff_len; u16 tvlv_len = 0; int ret; lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; ogm_buff = bat_priv->bat_v.ogm_buff; ogm_buff_len = bat_priv->bat_v.ogm_buff_len; /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff, &ogm_buff_len, BATADV_OGM2_HLEN); bat_priv->bat_v.ogm_buff = ogm_buff; bat_priv->bat_v.ogm_buff_len = ogm_buff_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len); if (!skb) goto reschedule; skb_reserve(skb, ETH_HLEN); skb_put_data(skb, ogm_buff, ogm_buff_len); ogm_packet = (struct batadv_ogm2_packet *)skb->data; ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno)); atomic_inc(&bat_priv->bat_v.ogm_seqno); ogm_packet->tvlv_len = htons(tvlv_len); /* broadcast on every interface */ rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, NULL, NULL); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 from ourselves on %s suppressed: %s\n", hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", ogm_packet->orig, ntohl(ogm_packet->seqno), ntohl(ogm_packet->throughput), ogm_packet->ttl, hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); /* this skb gets consumed by batadv_v_ogm_send_to_if() */ skb_tmp = skb_clone(skb, GFP_ATOMIC); if (!skb_tmp) { batadv_hardif_put(hard_iface); break; } batadv_v_ogm_queue_on_if(skb_tmp, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); consume_skb(skb); reschedule: batadv_v_ogm_start_timer(bat_priv); out: return; } /** * batadv_v_ogm_send() - periodic worker broadcasting the own OGM * @work: work queue item */ static void batadv_v_ogm_send(struct work_struct *work) { struct batadv_priv_bat_v *bat_v; struct batadv_priv *bat_priv; bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work); bat_priv = container_of(bat_v, struct batadv_priv, bat_v); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); batadv_v_ogm_send_meshif(bat_priv); mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { struct batadv_hard_iface_bat_v *batv; struct batadv_hard_iface *hard_iface; batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work); hard_iface = container_of(batv, struct batadv_hard_iface, bat_v); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_send(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_start_queue_timer(hard_iface); } /** * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * * Takes care of scheduling its own OGM sending routine for this interface. * * Return: 0 on success or a negative error code otherwise */ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); batadv_v_ogm_start_queue_timer(hard_iface); batadv_v_ogm_start_timer(bat_priv); return 0; } /** * batadv_v_ogm_iface_disable() - release OGM interface private resources * @hard_iface: interface for which the resources have to be released */ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq); spin_lock_bh(&hard_iface->bat_v.aggr_list.lock); batadv_v_ogm_aggr_list_free(hard_iface); spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock); } /** * batadv_v_ogm_primary_iface_set() - set a new primary interface * @primary_iface: the new primary interface */ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) { struct batadv_priv *bat_priv = netdev_priv(primary_iface->mesh_iface); struct batadv_ogm2_packet *ogm_packet; mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); if (!bat_priv->bat_v.ogm_buff) goto unlock; ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff; ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr); unlock: mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } /** * batadv_v_forward_penalty() - apply a penalty to the throughput metric * forwarded with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the mesh interface information * @if_incoming: the interface where the OGM has been received * @if_outgoing: the interface where the OGM has to be forwarded to * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the * characteristic of the interface where the OGM has been received. * * Initially the per hardif hop penalty is applied to the throughput. After * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, u32 throughput) { int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; /* Apply per hardif hop penalty */ throughput = throughput * (hop_penalty_max - if_hop_penalty) / hop_penalty_max; /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; /* Forwarding on the same WiFi interface cuts the throughput in half * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ if (throughput > 10 && if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; /* hop penalty of 255 equals 100% */ return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max; } /** * batadv_v_ogm_forward() - check conditions and forward an OGM to the given * outgoing interface * @bat_priv: the bat priv with all the mesh interface information * @ogm_received: previously received OGM to be forwarded * @orig_node: the originator which has been updated * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface on which this OGM was received on * @if_outgoing: the interface to which the OGM has to be forwarded to * * Forward an OGM to an interface after having altered the throughput metric and * the TTL value contained in it. The original OGM isn't modified. */ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm_received, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *router = NULL; struct batadv_ogm2_packet *ogm_forward; unsigned char *skb_buff; struct sk_buff *skb; size_t packet_len; u16 tvlv_len; /* only forward for specific interfaces, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; /* acquire possibly updated router */ router = batadv_orig_router_get(orig_node, if_outgoing); /* strict rule: forward packets coming from the best next hop only */ if (neigh_node != router) goto out; /* don't forward the same seqno twice on one interface */ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno)) goto out; orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno); if (ogm_received->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); goto out; } neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; tvlv_len = ntohs(ogm_received->tvlv_len); packet_len = BATADV_OGM2_HLEN + tvlv_len; skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, ETH_HLEN + packet_len); if (!skb) goto out; skb_reserve(skb, ETH_HLEN); skb_buff = skb_put_data(skb, ogm_received, packet_len); /* apply forward penalty */ ogm_forward = (struct batadv_ogm2_packet *)skb_buff; ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput); ogm_forward->ttl--; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), ogm_forward->ttl, if_incoming->net_dev->name); batadv_v_ogm_queue_on_if(skb, if_outgoing); out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); } /** * batadv_v_ogm_metric_update() - update route metric based on OGM * @bat_priv: the bat priv with all the mesh interface information * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: * 1 if the OGM is new, * 0 if it is not new but valid, * <0 on error (e.g. old OGM) */ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; bool protection_started = false; int ret = -EINVAL; u32 path_throughput; s32 seq_diff; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out; seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno; if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_OGM_MAX_AGE, &orig_ifinfo->batman_seqno_reset, &protection_started)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within window protection time from %pM\n", ogm2->orig); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Last reset: %ld, %ld\n", orig_ifinfo->batman_seqno_reset, jiffies); goto out; } /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. */ if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno); orig_ifinfo->last_ttl = ogm2->ttl; neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming, if_outgoing, ntohl(ogm2->throughput)); neigh_ifinfo->bat_v.throughput = path_throughput; neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno); neigh_ifinfo->last_ttl = ogm2->ttl; if (seq_diff > 0 || protection_started) ret = 1; else ret = 0; out: batadv_orig_ifinfo_put(orig_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return ret; } /** * batadv_v_ogm_route_update() - update routes based on OGM * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered * * Return: true if the packet should be forwarded, false otherwise */ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; u32 router_throughput, neigh_throughput; u32 router_last_seqno; u32 neigh_last_seqno; s32 neigh_seq_diff; bool forward = false; orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router && router->orig_node != orig_node && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } /* Mark the OGM to be considered for forwarding, and update routes * if needed. */ forward = true; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Searching and updating originator entry of received packet\n"); /* if this neighbor already is our next hop there is nothing * to change */ if (router == neigh_node) goto out; /* don't consider neighbours with worse throughput. * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than * the last received seqno from our best next hop. */ if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); /* if these are not allocated, something is wrong. */ if (!router_ifinfo || !neigh_ifinfo) goto out; neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; router_last_seqno = router_ifinfo->bat_v.last_seqno; neigh_seq_diff = neigh_last_seqno - router_last_seqno; router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && router_throughput >= neigh_throughput) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: batadv_neigh_node_put(router); batadv_neigh_node_put(orig_neigh_router); batadv_orig_node_put(orig_neigh_node); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_ifinfo_put(neigh_ifinfo); return forward; } /** * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { int seqno_age; bool forward; /* first, update the metric with according sanity checks */ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* outdated sequence numbers are to be discarded */ if (seqno_age < 0) return; /* only unknown & newer OGMs contain TVLVs we are interested in */ if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node, NULL, (unsigned char *)(ogm2 + 1), ntohs(ogm2->tvlv_len)); /* if the metric update went through, update routes if needed */ forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); /* if the routes have been processed correctly, check and forward */ if (forward) batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node, if_incoming, if_outgoing); } /** * batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm2_packet: potential OGM2 in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm2_packet *ogm2_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm2_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); return next_buff_pos <= packet_len; } /** * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; u32 ogm_throughput, link_throughput, path_throughput; struct list_head *iter; int ret; ethhdr = eth_hdr(skb); ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset); ogm_throughput = ntohl(ogm_packet->throughput); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, throughput %u, TTL %u, V %u, tvlv_len %u)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl, ogm_packet->version, ntohs(ogm_packet->tvlv_len)); if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from ourself\n"); return; } /* If the throughput metric is 0, immediately drop the packet. No need * to create orig_node / neigh_node for an unusable route. */ if (ogm_throughput == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with throughput metric of 0\n"); return; } /* require ELP packets be to received from this neighbor first */ hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (!hardif_neigh) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out; } orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) goto out; neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, ethhdr->h_source); if (!neigh_node) goto out; /* Update the received throughput metric to match the link * characteristic: * - If this OGM traveled one hop so far (emitted by single hop * neighbor) the path throughput metric equals the link throughput. * - For OGMs traversing more than hop the path throughput metric is * the smaller of the path throughput and the link throughput. */ link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); path_throughput = min_t(u32, link_throughput, ogm_throughput); ogm_packet->throughput = htonl(path_throughput); batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; ret = batadv_hardif_no_broadcast(hard_iface, ogm_packet->orig, hardif_neigh->orig); if (ret) { char *type; switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "OGM2 packet from %pM on %s suppressed: %s\n", ogm_packet->orig, hard_iface->net_dev->name, type); batadv_hardif_put(hard_iface); continue; } batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); out: batadv_orig_node_put(orig_node); batadv_neigh_node_put(neigh_node); batadv_hardif_neigh_put(hardif_neigh); } /** * batadv_v_ogm_packet_recv() - OGM2 receiving handler * @skb: the received OGM * @if_incoming: the interface where this OGM has been received * * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP * (without freeing the skb) on failure */ int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_ogm2_packet *ogm_packet; struct ethhdr *ethhdr; int ogm_offset; u8 *packet_pos; int ret = NET_RX_DROP; /* did we receive a OGM2 packet on an interface that does not have * B.A.T.M.A.N. V enabled ? */ if (strcmp(bat_priv->algo_ops->name, "BATMAN_V") != 0) goto free_skb; if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN)) goto free_skb; ethhdr = eth_hdr(skb); if (batadv_is_my_mac(bat_priv, ethhdr->h_source)) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm2_packet *)skb->data; while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_v_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM2_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm2_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_v_ogm_init() - initialise the OGM2 engine * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or a negative error code in case of failure */ int batadv_v_ogm_init(struct batadv_priv *bat_priv) { struct batadv_ogm2_packet *ogm_packet; unsigned char *ogm_buff; u32 random_seqno; bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN; ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) return -ENOMEM; bat_priv->bat_v.ogm_buff = ogm_buff; ogm_packet = (struct batadv_ogm2_packet *)ogm_buff; ogm_packet->packet_type = BATADV_OGM2; ogm_packet->version = BATADV_COMPAT_VERSION; ogm_packet->ttl = BATADV_TTL; ogm_packet->flags = BATADV_NO_FLAGS; ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno); INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send); mutex_init(&bat_priv->bat_v.ogm_buff_mutex); return 0; } /** * batadv_v_ogm_free() - free OGM private resources * @bat_priv: the bat priv with all the mesh interface information */ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); kfree(bat_priv->bat_v.ogm_buff); bat_priv->bat_v.ogm_buff = NULL; bat_priv->bat_v.ogm_buff_len = 0; mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } |
| 35813 3056 35373 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | // SPDX-License-Identifier: GPL-2.0 #include <linux/fault-inject.h> #include <linux/debugfs.h> #include <linux/error-injection.h> #include <linux/mm.h> static struct { struct fault_attr attr; bool ignore_gfp_highmem; bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) { return setup_fault_attr(&fail_page_alloc.attr, str); } __setup("fail_page_alloc=", setup_fail_page_alloc); bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { int flags = 0; if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; if (fail_page_alloc.ignore_gfp_reclaim && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) flags |= FAULT_NOWARN; return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_page_alloc_debugfs(void) { umode_t mode = S_IFREG | 0600; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); debugfs_create_bool("ignore-gfp-wait", mode, dir, &fail_page_alloc.ignore_gfp_reclaim); debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; } late_initcall(fail_page_alloc_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
| 12 12 509 236 68 24 10 1561 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ #ifndef _EXT4_EXTENTS #define _EXT4_EXTENTS #include "ext4.h" /* * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks * becomes very small, so index split, in-depth growing and * other hard changes happen much more often. * This is for debug purposes only. */ #define AGGRESSIVE_TEST_ /* * With EXTENTS_STATS defined, the number of blocks and extents * are collected in the truncate path. They'll be shown at * umount time. */ #define EXTENTS_STATS__ /* * If CHECK_BINSEARCH is defined, then the results of the binary search * will also be checked by linear search. */ #define CHECK_BINSEARCH__ /* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. * For non-inode extent blocks, ext4_extent_tail * follows the array. */ /* * This is the extent tail on-disk structure. * All other extent structures are 12 bytes long. It turns out that * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which * covers all valid ext4 block sizes. Therefore, this tail structure can be * crammed into the end of the block without having to rebalance the tree. */ struct ext4_extent_tail { __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ }; /* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ struct ext4_extent { __le32 ee_block; /* first logical block extent covers */ __le16 ee_len; /* number of blocks covered by extent */ __le16 ee_start_hi; /* high 16 bits of physical block */ __le32 ee_start_lo; /* low 32 bits of physical block */ }; /* * This is index on-disk structure. * It's used at all the levels except the bottom. */ struct ext4_extent_idx { __le32 ei_block; /* index covers logical blocks from 'block' */ __le32 ei_leaf_lo; /* pointer to the physical block of the next * * level. leaf or next index could be there */ __le16 ei_leaf_hi; /* high 16 bits of physical block */ __u16 ei_unused; }; /* * Each block (leaves and indexes), even inode-stored has header. */ struct ext4_extent_header { __le16 eh_magic; /* probably will support different formats */ __le16 eh_entries; /* number of valid entries */ __le16 eh_max; /* capacity of store in entries */ __le16 eh_depth; /* has tree real underlying blocks? */ __le32 eh_generation; /* generation of the tree */ }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) #define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) static inline struct ext4_extent_tail * find_ext4_extent_tail(struct ext4_extent_header *eh) { return (struct ext4_extent_tail *)(((void *)eh) + EXT4_EXTENT_TAIL_OFFSET(eh)); } /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. * Truncate uses it to simulate recursive walking. */ struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; struct buffer_head *p_bh; }; /* * Used to record a portion of a cluster found at the beginning or end * of an extent while traversing the extent tree during space removal. * A partial cluster may be removed if it does not contain blocks shared * with extents that aren't being deleted (tofree state). Otherwise, * it cannot be removed (nofree state). */ struct partial_cluster { ext4_fsblk_t pclu; /* physical cluster number */ ext4_lblk_t lblk; /* logical block number within logical cluster */ enum {initial, tofree, nofree} state; }; /* * structure for external API */ /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this * particular extent is an initialized extent or an unwritten (i.e. * preallocated). * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an * unwritten one. In other words, if MSB of ee_len is set, it is an * unwritten extent with only one special scenario when ee_len = 0x8000. * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) #define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ ((struct ext4_extent *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_FIRST_INDEX(__hdr__) \ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_HAS_FREE_INDEX(__path__) \ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ < le16_to_cpu((__path__)->p_hdr->eh_max)) #define EXT_LAST_EXTENT(__hdr__) \ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { return (struct ext4_extent_header *) EXT4_I(inode)->i_data; } static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) { return (struct ext4_extent_header *) bh->b_data; } static inline unsigned short ext_depth(struct inode *inode) { return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); } static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) { return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? le16_to_cpu(ext->ee_len) : (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) { ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } /* * ext4_ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; block = le32_to_cpu(ex->ee_start_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; return block; } /* * ext4_idx_pblock: * combine low and high parts of a leaf physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) { ext4_fsblk_t block; block = le32_to_cpu(ix->ei_leaf_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; return block; } /* * ext4_ext_store_pblock: * stores a large physical block number into an extent struct, * breaking it into parts */ static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) { ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } /* * ext4_idx_store_pblock: * stores a large physical block number into an index struct, * breaking it into parts */ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) { ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } #endif /* _EXT4_EXTENTS */ |
| 290 30170 237 1756 1590 244 382 730 33452 504 1598 30369 1864 730 382 33517 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I/O iterator iteration building functions. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_IOV_ITER_H #define _LINUX_IOV_ITER_H #include <linux/uio.h> #include <linux/bvec.h> #include <linux/folio_queue.h> typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len, void *priv, void *priv2); /* * Handle ITER_UBUF. */ static __always_inline size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { void __user *base = iter->ubuf; size_t progress = 0, remain; remain = step(base + iter->iov_offset, 0, len, priv, priv2); progress = len - remain; iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_IOVEC. */ static __always_inline size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f step) { const struct iovec *p = iter->__iov; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->__iov; iter->__iov = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_KVEC. */ static __always_inline size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct kvec *p = iter->kvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->kvec; iter->kvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_BVEC. */ static __always_inline size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct bio_vec *p = iter->bvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t offset = p->bv_offset + skip, part; void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE); part = min3(len, (size_t)(p->bv_len - skip), (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2); kunmap_local(kaddr); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= p->bv_len) { skip = 0; p++; } if (remain) break; } while (len); iter->nr_segs -= p - iter->bvec; iter->bvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_FOLIOQ. */ static __always_inline size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct folio_queue *folioq = iter->folioq; unsigned int slot = iter->folioq_slot; size_t progress = 0, skip = iter->iov_offset; if (slot == folioq_nr_slots(folioq)) { /* The iterator may have been extended. */ folioq = folioq->next; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t part, remain = 0, consumed; size_t fsize; void *base; if (!folio) break; fsize = folioq_folio_size(folioq, slot); if (skip < fsize) { base = kmap_local_folio(folio, skip); part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; } if (skip >= fsize) { skip = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } if (remain) break; } while (len); iter->folioq_slot = slot; iter->folioq = folioq; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_XARRAY. */ static __always_inline size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { struct folio *folio; size_t progress = 0; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; XA_STATE(xas, iter->xarray, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { size_t remain, consumed, offset, part, flen; if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start + progress); flen = min(folio_size(folio) - offset, len); while (flen) { void *base = kmap_local_folio(folio, offset); part = min_t(size_t, flen, PAGE_SIZE - offset_in_page(offset)); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; progress += consumed; len -= consumed; if (remain || len == 0) goto out; flen -= consumed; offset += consumed; } } out: rcu_read_unlock(); iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_DISCARD. */ static __always_inline size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { size_t progress = len; iter->count -= progress; return progress; } /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * Two step functions, @step and @ustep, must be provided, one for handling * mapped kernel addresses and the other is given user addresses which have the * potential to fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f ustep, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (likely(iter_is_ubuf(iter))) return iterate_ubuf(iter, len, priv, priv2, ustep); if (likely(iter_is_iovec(iter))) return iterate_iovec(iter, len, priv, priv2, ustep); if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } /** * iterate_and_advance - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * As iterate_and_advance2(), but priv2 is always NULL. */ static __always_inline size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, iov_ustep_f ustep, iov_step_f step) { return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } /** * iterate_and_advance_kernel - Iterate over a kernel-internal iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type * iterators; it will not handle UBUF or IOVEC-type iterators. * * A step functions, @step, must be provided, one for handling mapped kernel * addresses and the other is given user addresses which have the potential to * fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } #endif /* _LINUX_IOV_ITER_H */ |
| 20 30 47 47 3 24 23 53 24 23 37 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | /* * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _TLS_INT_H #define _TLS_INT_H #include <asm/byteorder.h> #include <linux/types.h> #include <linux/skmsg.h> #include <net/tls.h> #include <net/tls_prot.h> #define TLS_PAGE_ORDER (min_t(unsigned int, PAGE_ALLOC_COSTLY_ORDER, \ TLS_MAX_PAYLOAD_SIZE >> PAGE_SHIFT)) #define __TLS_INC_STATS(net, field) \ __SNMP_INC_STATS((net)->mib.tls_statistics, field) #define TLS_INC_STATS(net, field) \ SNMP_INC_STATS((net)->mib.tls_statistics, field) #define TLS_DEC_STATS(net, field) \ SNMP_DEC_STATS((net)->mib.tls_statistics, field) struct tls_cipher_desc { unsigned int nonce; unsigned int iv; unsigned int key; unsigned int salt; unsigned int tag; unsigned int rec_seq; unsigned int iv_offset; unsigned int key_offset; unsigned int salt_offset; unsigned int rec_seq_offset; char *cipher_name; bool offloadable; size_t crypto_info; }; #define TLS_CIPHER_MIN TLS_CIPHER_AES_GCM_128 #define TLS_CIPHER_MAX TLS_CIPHER_ARIA_GCM_256 extern const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN]; static inline const struct tls_cipher_desc *get_cipher_desc(u16 cipher_type) { if (cipher_type < TLS_CIPHER_MIN || cipher_type > TLS_CIPHER_MAX) return NULL; return &tls_cipher_desc[cipher_type - TLS_CIPHER_MIN]; } static inline char *crypto_info_iv(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->iv_offset; } static inline char *crypto_info_key(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->key_offset; } static inline char *crypto_info_salt(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->salt_offset; } static inline char *crypto_info_rec_seq(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->rec_seq_offset; } /* TLS records are maintained in 'struct tls_rec'. It stores the memory pages * allocated or mapped for each TLS record. After encryption, the records are * stores in a linked list. */ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; struct sk_msg msg_plaintext; struct sk_msg msg_encrypted; /* AAD | msg_plaintext.sg.data | sg_tag */ struct scatterlist sg_aead_in[2]; /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ struct scatterlist sg_aead_out[2]; char content_type; struct scatterlist sg_content_type; struct sock *sk; char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_MAX_IV_SIZE]; /* Must be last --ends in a flexible-array member. */ struct aead_request aead_req; }; int __net_init tls_proc_init(struct net *net); void __net_exit tls_proc_fini(struct net *net); struct tls_context *tls_ctx_create(struct sock *sk); void tls_ctx_free(struct sock *sk, struct tls_context *ctx); void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc); int tls_set_sw_offload(struct sock *sk, int tx, struct tls_crypto_info *new_crypto_info); void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_sw_splice_eof(struct socket *sock); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_release_resources_tx(struct sock *sk); void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t read_actor); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); int tls_tx_records(struct sock *sk, int flags); void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); int tls_process_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); int decrypt_skb(struct sock *sk, struct scatterlist *sgout); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); int tls_strp_dev_init(void); void tls_strp_dev_exit(void); void tls_strp_done(struct tls_strparser *strp); void tls_strp_stop(struct tls_strparser *strp); int tls_strp_init(struct tls_strparser *strp, struct sock *sk); void tls_strp_data_ready(struct tls_strparser *strp); void tls_strp_check_rcv(struct tls_strparser *strp); void tls_strp_msg_done(struct tls_strparser *strp); int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); void tls_rx_msg_ready(struct tls_strparser *strp); bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); static inline struct tls_msg *tls_msg(struct sk_buff *skb) { struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb; return &scb->tls; } static inline struct sk_buff *tls_strp_msg(struct tls_sw_context_rx *ctx) { DEBUG_NET_WARN_ON_ONCE(!ctx->strp.msg_ready || !ctx->strp.anchor->len); return ctx->strp.anchor; } static inline bool tls_strp_msg_ready(struct tls_sw_context_rx *ctx) { return READ_ONCE(ctx->strp.msg_ready); } static inline bool tls_strp_msg_mixed_decrypted(struct tls_sw_context_rx *ctx) { return ctx->strp.mixed_decrypted; } #ifdef CONFIG_TLS_DEVICE int tls_device_init(void); void tls_device_cleanup(void); int tls_set_device_offload(struct sock *sk); void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx); #else static inline int tls_device_init(void) { return 0; } static inline void tls_device_cleanup(void) {} static inline int tls_set_device_offload(struct sock *sk) { return -EOPNOTSUPP; } static inline void tls_device_free_resources_tx(struct sock *sk) {} static inline int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { return -EOPNOTSUPP; } static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} static inline void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} static inline int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { return 0; } #endif int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, int flags); void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); static inline bool tls_is_partially_sent_record(struct tls_context *ctx) { return !!ctx->partially_sent_record; } static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) { return tls_ctx->pending_open_record_frags; } static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; for (i = len - 1; i >= 0; i--) { ++seq[i]; if (seq[i] != 0) break; } return (i == -1); } static inline void tls_bigint_subtract(unsigned char *seq, int n) { u64 rcd_sn; __be64 *p; BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8); p = (__be64 *)seq; rcd_sn = be64_to_cpu(*p); *p = cpu_to_be64(rcd_sn - n); } static inline void tls_advance_record_sn(struct sock *sk, struct tls_prot_info *prot, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) tls_err_abort(sk, -EBADMSG); if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) tls_bigint_increment(ctx->iv + prot->salt_size, prot->iv_size); } static inline void tls_xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq) { int i; if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { for (i = 0; i < 8; i++) iv[i + 4] ^= seq[i]; } } static inline void tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; size_t pkt_len, iv_size = prot->iv_size; pkt_len = plaintext_len + prot->tag_size; if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) { pkt_len += iv_size; memcpy(buf + TLS_NONCE_OFFSET, ctx->tx.iv + prot->salt_size, iv_size); } /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = prot->version == TLS_1_3_VERSION ? TLS_RECORD_TYPE_DATA : record_type; /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */ buf[1] = TLS_1_2_VERSION_MINOR; buf[2] = TLS_1_2_VERSION_MAJOR; /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; } static inline void tls_make_aad(char *buf, size_t size, char *record_sequence, unsigned char record_type, struct tls_prot_info *prot) { if (prot->version != TLS_1_3_VERSION) { memcpy(buf, record_sequence, prot->rec_seq_size); buf += 8; } else { size += prot->tag_size; } buf[0] = prot->version == TLS_1_3_VERSION ? TLS_RECORD_TYPE_DATA : record_type; buf[1] = TLS_1_2_VERSION_MAJOR; buf[2] = TLS_1_2_VERSION_MINOR; buf[3] = size >> 8; buf[4] = size & 0xFF; } #endif |
| 10 1 3 6 7 4 4 4685 1216 4685 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> #include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) { if (dst_xfrm(dst)) return FLOW_OFFLOAD_XMIT_XFRM; return FLOW_OFFLOAD_XMIT_NEIGH; } static void nft_default_forward_path(struct nf_flow_route *route, struct dst_entry *dst_cache, enum ip_conntrack_dir dir) { route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; route->tuple[dir].dst = dst_cache; route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); } static bool nft_is_valid_ether_device(const struct net_device *dev) { if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return false; return true; } static int nft_dev_fill_forward_path(const struct nf_flow_route *route, const struct dst_entry *dst_cache, const struct nf_conn *ct, enum ip_conntrack_dir dir, u8 *ha, struct net_device_path_stack *stack) { const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; struct net_device *dev = dst_cache->dev; struct neighbour *n; u8 nud_state; if (!nft_is_valid_ether_device(dev)) goto out; n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -1; read_lock_bh(&n->lock); nud_state = n->nud_state; ether_addr_copy(ha, n->ha); read_unlock_bh(&n->lock); neigh_release(n); if (!(nud_state & NUD_VALID)) return -1; out: return dev_fill_forward_path(dev, ha, stack); } struct nft_forward_info { const struct net_device *indev; const struct net_device *outdev; const struct net_device *hw_outdev; struct id { __u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; u8 num_encaps; u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; enum flow_offload_xmit_type xmit_type; }; static void nft_dev_path_info(const struct net_device_path_stack *stack, struct nft_forward_info *info, unsigned char *ha, struct nf_flowtable *flowtable) { const struct net_device_path *path; int i; memcpy(info->h_dest, ha, ETH_ALEN); for (i = 0; i < stack->num_paths; i++) { path = &stack->path[i]; switch (path->type) { case DEV_PATH_ETHERNET: case DEV_PATH_DSA: case DEV_PATH_VLAN: case DEV_PATH_PPPOE: info->indev = path->dev; if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); if (path->type == DEV_PATH_ETHERNET) break; if (path->type == DEV_PATH_DSA) { i = stack->num_paths; break; } /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { info->indev = NULL; break; } if (!info->outdev) info->outdev = path->dev; info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; if (path->type == DEV_PATH_PPPOE) memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); switch (path->bridge.vlan_mode) { case DEV_PATH_BR_VLAN_UNTAG_HW: info->ingress_vlans |= BIT(info->num_encaps - 1); break; case DEV_PATH_BR_VLAN_TAG: info->encap[info->num_encaps].id = path->bridge.vlan_id; info->encap[info->num_encaps].proto = path->bridge.vlan_proto; info->num_encaps++; break; case DEV_PATH_BR_VLAN_UNTAG: info->num_encaps--; break; case DEV_PATH_BR_VLAN_KEEP: break; } info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; break; default: info->indev = NULL; break; } } if (!info->outdev) info->outdev = info->indev; info->hw_outdev = info->indev; if (nf_flowtable_hw_offload(flowtable) && nft_is_valid_ether_device(info->indev)) info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; } static bool nft_flowtable_find_dev(const struct net_device *dev, struct nft_flowtable *ft) { struct nft_hook *hook; bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; break; } return found; } static void nft_dev_forward_path(struct nf_flow_route *route, const struct nf_conn *ct, enum ip_conntrack_dir dir, struct nft_flowtable *ft) { const struct dst_entry *dst = route->tuple[dir].dst; struct net_device_path_stack stack; struct nft_forward_info info = {}; unsigned char ha[ETH_ALEN]; int i; if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) nft_dev_path_info(&stack, &info, ha, &ft->data); if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) return; route->tuple[!dir].in.ifindex = info.indev->ifindex; for (i = 0; i < info.num_encaps; i++) { route->tuple[!dir].in.encap[i].id = info.encap[i].id; route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; } route->tuple[!dir].in.num_encaps = info.num_encaps; route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].out.ifindex = info.outdev->ifindex; route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; route->tuple[dir].xmit_type = info.xmit_type; } } static int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, struct nf_flow_route *route, enum ip_conntrack_dir dir, struct nft_flowtable *ft) { struct dst_entry *this_dst = skb_dst(pkt->skb); struct dst_entry *other_dst = NULL; struct flowi fl; memset(&fl, 0, sizeof(fl)); switch (nft_pf(pkt)) { case NFPROTO_IPV4: fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; case NFPROTO_IPV6: fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); fl.u.ip6.flowi6_mark = pkt->skb->mark; fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; break; } if (!dst_hold_safe(this_dst)) return -ENOENT; nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); if (!other_dst) { dst_release(this_dst); return -ENOENT; } nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { nft_dev_forward_path(route, ct, dir, ft); nft_dev_forward_path(route, ct, !dir, ft); } return 0; } static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void flow_offload_ct_tcp(struct nf_conn *ct) { /* conntrack will not see all packets, disable tcp window validation. */ spin_lock_bh(&ct->lock); ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; spin_unlock_bh(&ct->lock); } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->net, ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module"); |
| 69 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | /* * net/tipc/eth_media.c: Ethernet bearer support for TIPC * * Copyright (c) 2001-2007, 2013-2014, Ericsson AB * Copyright (c) 2005-2008, 2011-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "bearer.h" /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) { if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ return 1; sprintf(strbuf, "%pM", addr->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } /* Convert discovery msg addr format to Ethernet media addr format */ static int tipc_eth_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { /* Skip past preamble: */ msg += TIPC_MEDIA_ADDR_OFFSET; return tipc_eth_raw2addr(b, addr, msg); } /* Ethernet media registration info */ struct tipc_media eth_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_eth_addr2str, .addr2msg = tipc_eth_addr2msg, .msg2addr = tipc_eth_msg2addr, .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" }; |
| 3 4684 4516 191 7 7 2 3 3 3 3 5 1 1 1 2 1 1 5 1 1 1 2 2 1 1 5 8 1 1 1 5 5 2 3 3 3 7 1 3 2 1 1 2 4 4 4 1 2 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 62 14 1 1 1 5 2 1 2 1 17 1 2 1 3 4 5 3 5 2 6 1 5 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 5 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 | // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = timer_container_of(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; rc = -EINVAL; if (len < 0) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } net_dbg_ratelimited("x25_sendmsg: Built header.\n"); net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25); |
| 7 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/lapb.h> /* * This routine purges all the queues of frames. */ void lapb_clear_queues(struct lapb_cb *lapb) { skb_queue_purge(&lapb->write_queue); skb_queue_purge(&lapb->ack_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void lapb_frames_acked(struct lapb_cb *lapb, unsigned short nr) { struct sk_buff *skb; int modulus; modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; /* * Remove all the ack-ed frames from the ack queue. */ if (lapb->va != nr) while (skb_peek(&lapb->ack_queue) && lapb->va != nr) { skb = skb_dequeue(&lapb->ack_queue); kfree_skb(skb); lapb->va = (lapb->va + 1) % modulus; } } void lapb_requeue_frames(struct lapb_cb *lapb) { struct sk_buff *skb, *skb_prev = NULL; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by lapb_kick called from the timer. This arrangement handles the * possibility of an empty output queue. */ while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) { if (!skb_prev) skb_queue_head(&lapb->write_queue, skb); else skb_append(skb_prev, skb, &lapb->write_queue); skb_prev = skb; } } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int lapb_validate_nr(struct lapb_cb *lapb, unsigned short nr) { unsigned short vc = lapb->va; int modulus; modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; while (vc != lapb->vs) { if (nr == vc) return 1; vc = (vc + 1) % modulus; } return nr == lapb->vs; } /* * This routine is the centralised routine for parsing the control * information for the different frame formats. */ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb, struct lapb_frame *frame) { frame->type = LAPB_ILLEGAL; lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data); /* We always need to look at 2 bytes, sometimes we need * to look at 3 and those cases are handled below. */ if (!pskb_may_pull(skb, 2)) return -1; if (lapb->mode & LAPB_MLP) { if (lapb->mode & LAPB_DCE) { if (skb->data[0] == LAPB_ADDR_D) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_C) frame->cr = LAPB_RESPONSE; } else { if (skb->data[0] == LAPB_ADDR_C) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_D) frame->cr = LAPB_RESPONSE; } } else { if (lapb->mode & LAPB_DCE) { if (skb->data[0] == LAPB_ADDR_B) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_A) frame->cr = LAPB_RESPONSE; } else { if (skb->data[0] == LAPB_ADDR_A) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_B) frame->cr = LAPB_RESPONSE; } } skb_pull(skb, 1); if (lapb->mode & LAPB_EXTENDED) { if (!(skb->data[0] & LAPB_S)) { if (!pskb_may_pull(skb, 2)) return -1; /* * I frame - carries NR/NS/PF */ frame->type = LAPB_I; frame->ns = (skb->data[0] >> 1) & 0x7F; frame->nr = (skb->data[1] >> 1) & 0x7F; frame->pf = skb->data[1] & LAPB_EPF; frame->control[0] = skb->data[0]; frame->control[1] = skb->data[1]; skb_pull(skb, 2); } else if ((skb->data[0] & LAPB_U) == 1) { if (!pskb_may_pull(skb, 2)) return -1; /* * S frame - take out PF/NR */ frame->type = skb->data[0] & 0x0F; frame->nr = (skb->data[1] >> 1) & 0x7F; frame->pf = skb->data[1] & LAPB_EPF; frame->control[0] = skb->data[0]; frame->control[1] = skb->data[1]; skb_pull(skb, 2); } else if ((skb->data[0] & LAPB_U) == 3) { /* * U frame - take out PF */ frame->type = skb->data[0] & ~LAPB_SPF; frame->pf = skb->data[0] & LAPB_SPF; frame->control[0] = skb->data[0]; frame->control[1] = 0x00; skb_pull(skb, 1); } } else { if (!(skb->data[0] & LAPB_S)) { /* * I frame - carries NR/NS/PF */ frame->type = LAPB_I; frame->ns = (skb->data[0] >> 1) & 0x07; frame->nr = (skb->data[0] >> 5) & 0x07; frame->pf = skb->data[0] & LAPB_SPF; } else if ((skb->data[0] & LAPB_U) == 1) { /* * S frame - take out PF/NR */ frame->type = skb->data[0] & 0x0F; frame->nr = (skb->data[0] >> 5) & 0x07; frame->pf = skb->data[0] & LAPB_SPF; } else if ((skb->data[0] & LAPB_U) == 3) { /* * U frame - take out PF */ frame->type = skb->data[0] & ~LAPB_SPF; frame->pf = skb->data[0] & LAPB_SPF; } frame->control[0] = skb->data[0]; skb_pull(skb, 1); } return 0; } /* * This routine is called when the HDLC layer internally generates a * command or response for the remote machine ( eg. RR, UA etc. ). * Only supervisory or unnumbered frames are processed, FRMRs are handled * by lapb_transmit_frmr below. */ void lapb_send_control(struct lapb_cb *lapb, int frametype, int poll_bit, int type) { struct sk_buff *skb; unsigned char *dptr; if ((skb = alloc_skb(LAPB_HEADER_LEN + 3, GFP_ATOMIC)) == NULL) return; skb_reserve(skb, LAPB_HEADER_LEN + 1); if (lapb->mode & LAPB_EXTENDED) { if ((frametype & LAPB_U) == LAPB_U) { dptr = skb_put(skb, 1); *dptr = frametype; *dptr |= poll_bit ? LAPB_SPF : 0; } else { dptr = skb_put(skb, 2); dptr[0] = frametype; dptr[1] = (lapb->vr << 1); dptr[1] |= poll_bit ? LAPB_EPF : 0; } } else { dptr = skb_put(skb, 1); *dptr = frametype; *dptr |= poll_bit ? LAPB_SPF : 0; if ((frametype & LAPB_U) == LAPB_S) /* S frames carry NR */ *dptr |= (lapb->vr << 5); } lapb_transmit_buffer(lapb, skb, type); } /* * This routine generates FRMRs based on information previously stored in * the LAPB control block. */ void lapb_transmit_frmr(struct lapb_cb *lapb) { struct sk_buff *skb; unsigned char *dptr; if ((skb = alloc_skb(LAPB_HEADER_LEN + 7, GFP_ATOMIC)) == NULL) return; skb_reserve(skb, LAPB_HEADER_LEN + 1); if (lapb->mode & LAPB_EXTENDED) { dptr = skb_put(skb, 6); *dptr++ = LAPB_FRMR; *dptr++ = lapb->frmr_data.control[0]; *dptr++ = lapb->frmr_data.control[1]; *dptr++ = (lapb->vs << 1) & 0xFE; *dptr = (lapb->vr << 1) & 0xFE; if (lapb->frmr_data.cr == LAPB_RESPONSE) *dptr |= 0x01; dptr++; *dptr++ = lapb->frmr_type; lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n", lapb->dev, lapb->state, &skb->data[1]); } else { dptr = skb_put(skb, 4); *dptr++ = LAPB_FRMR; *dptr++ = lapb->frmr_data.control[0]; *dptr = (lapb->vs << 1) & 0x0E; *dptr |= (lapb->vr << 5) & 0xE0; if (lapb->frmr_data.cr == LAPB_RESPONSE) *dptr |= 0x10; dptr++; *dptr++ = lapb->frmr_type; lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n", lapb->dev, lapb->state, &skb->data[1]); } lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE); } |
| 298 44 257 291 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 | /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/random.h> #include <linux/export.h> #include "rds.h" /* * All of connection management is simplified by serializing it through * work queues that execute in a connection managing thread. * * TCP wants to send acks through sendpage() in response to data_ready(), * but it needs a process context to do so. * * The receive paths need to allocate but can't drop packets (!) so we have * a thread around to block allocating if the receive fast path sees an * allocation failure. */ /* Grand Unified Theory of connection life cycle: * At any point in time, the connection can be in one of these states: * DOWN, CONNECTING, UP, DISCONNECTING, ERROR * * The following transitions are possible: * ANY -> ERROR * UP -> DISCONNECTING * ERROR -> DISCONNECTING * DISCONNECTING -> DOWN * DOWN -> CONNECTING * CONNECTING -> UP * * Transition to state DISCONNECTING/DOWN: * - Inside the shutdown worker; synchronizes with xmit path * through RDS_IN_XMIT, and with connection management callbacks * via c_cm_lock. * * For receive callbacks, we rely on the underlying transport * (TCP, IB/RDMA) to provide the necessary synchronisation. */ struct workqueue_struct *rds_wq; EXPORT_SYMBOL_GPL(rds_wq); void rds_connect_path_complete(struct rds_conn_path *cp, int curr) { if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { printk(KERN_WARNING "%s: Cannot transition to state UP, " "current state is %d\n", __func__, atomic_read(&cp->cp_state)); rds_conn_path_drop(cp, false); return; } rdsdebug("conn %p for %pI6c to %pI6c complete\n", cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) { queue_delayed_work(rds_wq, &cp->cp_send_w, 0); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); } rcu_read_unlock(); cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; } EXPORT_SYMBOL_GPL(rds_connect_path_complete); void rds_connect_complete(struct rds_connection *conn) { rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING); } EXPORT_SYMBOL_GPL(rds_connect_complete); /* * This random exponential backoff is relied on to eventually resolve racing * connects. * * If connect attempts race then both parties drop both connections and come * here to wait for a random amount of time before trying again. Eventually * the backoff range will be so much greater than the time it takes to * establish a connection that one of the pair will establish the connection * before the other's random delay fires. * * Connection attempts that arrive while a connection is already established * are also considered to be racing connects. This lets a connection from * a rebooted machine replace an existing stale connection before the transport * notices that the connection has failed. * * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. */ void rds_queue_reconnect(struct rds_conn_path *cp) { unsigned long rand; struct rds_connection *conn = cp->cp_conn; rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", conn, &conn->c_laddr, &conn->c_faddr, cp->cp_reconnect_jiffies); /* let peer with smaller addr initiate reconnect, to avoid duels */ if (conn->c_trans->t_type == RDS_TRANS_TCP && rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) return; set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); if (cp->cp_reconnect_jiffies == 0) { cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); return; } get_random_bytes(&rand, sizeof(rand)); rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_conn_w, rand % cp->cp_reconnect_jiffies); rcu_read_unlock(); cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2, rds_sysctl_reconnect_max_jiffies); } void rds_connect_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_conn_w.work); struct rds_connection *conn = cp->cp_conn; int ret; if (cp->cp_index > 0 && rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); if (ret) { ret = conn->c_trans->conn_path_connect(cp); rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { if (rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) rds_queue_reconnect(cp); else rds_conn_path_error(cp, "connect failed\n"); } } } void rds_send_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_send_w.work); int ret; if (rds_conn_path_state(cp) == RDS_CONN_UP) { clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags); ret = rds_send_xmit(cp); cond_resched(); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_send_immediate_retry); queue_delayed_work(rds_wq, &cp->cp_send_w, 0); break; case -ENOMEM: rds_stats_inc(s_send_delayed_retry); queue_delayed_work(rds_wq, &cp->cp_send_w, 2); break; default: break; } } } void rds_recv_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_recv_w.work); int ret; if (rds_conn_path_state(cp) == RDS_CONN_UP) { ret = cp->cp_conn->c_trans->recv_path(cp); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret); switch (ret) { case -EAGAIN: rds_stats_inc(s_recv_immediate_retry); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); break; case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); break; default: break; } } } void rds_shutdown_worker(struct work_struct *work) { struct rds_conn_path *cp = container_of(work, struct rds_conn_path, cp_down_w); rds_conn_shutdown(cp); } void rds_threads_exit(void) { destroy_workqueue(rds_wq); } int rds_threads_init(void) { rds_wq = create_singlethread_workqueue("krdsd"); if (!rds_wq) return -ENOMEM; return 0; } /* Compare two IPv6 addresses. Return 0 if the two addresses are equal. * Return 1 if the first is greater. Return -1 if the second is greater. */ int rds_addr_cmp(const struct in6_addr *addr1, const struct in6_addr *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *a1, *a2; u64 x, y; a1 = (__be64 *)addr1; a2 = (__be64 *)addr2; if (*a1 != *a2) { if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) return -1; else return 1; } else { x = be64_to_cpu(*++a1); y = be64_to_cpu(*++a2); if (x < y) return -1; else if (x > y) return 1; else return 0; } #else u32 a, b; int i; for (i = 0; i < 4; i++) { if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { a = ntohl(addr1->s6_addr32[i]); b = ntohl(addr2->s6_addr32[i]); if (a < b) return -1; else if (a > b) return 1; } } return 0; #endif } EXPORT_SYMBOL_GPL(rds_addr_cmp); |
| 68 3 2 1 114 26 35 37 16 71 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix * Copyright (C) 2006 Andrey Volkov, Varma Electronics * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com> */ #include <linux/can/dev.h> #include <linux/module.h> #define MOD_DESC "CAN device driver interface" MODULE_DESCRIPTION(MOD_DESC); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>"); /* Local echo of CAN messages * * CAN network devices *should* support a local echo functionality * (see Documentation/networking/can.rst). To test the handling of CAN * interfaces that do not support the local echo both driver types are * implemented. In the case that the driver does not support the echo * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core * to perform the echo as a fallback solution. */ void can_flush_echo_skb(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; int i; for (i = 0; i < priv->echo_skb_max; i++) { if (priv->echo_skb[i]) { kfree_skb(priv->echo_skb[i]); priv->echo_skb[i] = NULL; stats->tx_dropped++; stats->tx_aborted_errors++; } } } /* Put the skb on the stack to be looped backed locally lateron * * The function is typically called in the start_xmit function * of the device driver. The driver must protect access to * priv->echo_skb, if necessary. */ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return -EINVAL; } /* check flag whether this packet has to be looped back */ if (!(dev->flags & IFF_ECHO) || (skb->protocol != htons(ETH_P_CAN) && skb->protocol != htons(ETH_P_CANFD) && skb->protocol != htons(ETH_P_CANXL))) { kfree_skb(skb); return 0; } if (!priv->echo_skb[idx]) { skb = can_create_echo_skb(skb); if (!skb) return -ENOMEM; /* make settings for echo to reduce code in irq context */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->dev = dev; /* save frame_len to reuse it when transmission is completed */ can_skb_prv(skb)->frame_len = frame_len; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; skb_tx_timestamp(skb); /* save this skb for tx interrupt echo handling */ priv->echo_skb[idx] = skb; } else { /* locking problem with netif_stop_queue() ?? */ netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx); kfree_skb(skb); return -EBUSY; } return 0; } EXPORT_SYMBOL_GPL(can_put_echo_skb); struct sk_buff * __can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return NULL; } if (priv->echo_skb[idx]) { /* Using "struct canfd_frame::len" for the frame * length is supported on both CAN and CANFD frames. */ struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) skb_tstamp_tx(skb, skb_hwtstamps(skb)); /* get the real payload length for netdev statistics */ *len_ptr = can_skb_get_data_len(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; priv->echo_skb[idx] = NULL; if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_BROADCAST; } else { dev_consume_skb_any(skb); return NULL; } return skb; } return NULL; } /* Get the skb from the stack and loop it back locally * * The function is typically called when the TX done interrupt * is handled in the device driver. The driver must protect * access to priv->echo_skb, if necessary. */ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct sk_buff *skb; unsigned int len; skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; skb_get(skb); if (netif_rx(skb) == NET_RX_SUCCESS) dev_consume_skb_any(skb); else dev_kfree_skb_any(skb); return len; } EXPORT_SYMBOL_GPL(can_get_echo_skb); /* Remove the skb from the stack and free it. * * The function is typically called when TX failed. */ void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return; } if (priv->echo_skb[idx]) { struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; dev_kfree_skb_any(skb); priv->echo_skb[idx] = NULL; } } EXPORT_SYMBOL_GPL(can_free_echo_skb); /* fill common values for CAN sk_buffs */ static void init_can_skb_reserve(struct sk_buff *skb) { skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); can_skb_reserve(skb); can_skb_prv(skb)->skbcnt = 0; } struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct can_frame)); if (unlikely(!skb)) { *cf = NULL; return NULL; } skb->protocol = htons(ETH_P_CAN); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cf = skb_put_zero(skb, sizeof(struct can_frame)); return skb; } EXPORT_SYMBOL_GPL(alloc_can_skb); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct canfd_frame)); if (unlikely(!skb)) { *cfd = NULL; return NULL; } skb->protocol = htons(ETH_P_CANFD); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); /* set CAN FD flag by default */ (*cfd)->flags = CANFD_FDF; return skb; } EXPORT_SYMBOL_GPL(alloc_canfd_skb); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len) { struct sk_buff *skb; if (data_len < CANXL_MIN_DLEN || data_len > CANXL_MAX_DLEN) goto out_error; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + CANXL_HDR_SIZE + data_len); if (unlikely(!skb)) goto out_error; skb->protocol = htons(ETH_P_CANXL); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cxl = skb_put_zero(skb, CANXL_HDR_SIZE + data_len); /* set CAN XL flag and length information by default */ (*cxl)->flags = CANXL_XLF; (*cxl)->len = data_len; return skb; out_error: *cxl = NULL; return NULL; } EXPORT_SYMBOL_GPL(alloc_canxl_skb); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = alloc_can_skb(dev, cf); if (unlikely(!skb)) return NULL; (*cf)->can_id = CAN_ERR_FLAG; (*cf)->len = CAN_ERR_DLC; return skb; } EXPORT_SYMBOL_GPL(alloc_can_err_skb); /* Check for outgoing skbs that have not been created by the CAN subsystem */ static bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb) { /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) return false; /* af_packet does not apply CAN skb specific settings */ if (skb->ip_summed == CHECKSUM_NONE) { /* init headroom */ can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->ip_summed = CHECKSUM_UNNECESSARY; /* perform proper loopback on capable devices */ if (dev->flags & IFF_ECHO) skb->pkt_type = PACKET_LOOPBACK; else skb->pkt_type = PACKET_HOST; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); /* set CANFD_FDF flag for CAN FD frames */ if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd; cfd = (struct canfd_frame *)skb->data; cfd->flags |= CANFD_FDF; } } return true; } /* Drop a given socketbuffer if it does not contain a valid CAN frame. */ bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb) { switch (ntohs(skb->protocol)) { case ETH_P_CAN: if (!can_is_can_skb(skb)) goto inval_skb; break; case ETH_P_CANFD: if (!can_is_canfd_skb(skb)) goto inval_skb; break; case ETH_P_CANXL: if (!can_is_canxl_skb(skb)) goto inval_skb; break; default: goto inval_skb; } if (!can_skb_headroom_valid(dev, skb)) goto inval_skb; return false; inval_skb: kfree_skb(skb); dev->stats.tx_dropped++; return true; } EXPORT_SYMBOL_GPL(can_dropped_invalid_skb); |
| 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 | // SPDX-License-Identifier: GPL-2.0-or-later /* * (c) 1999 Andreas Gal <gal@cs.uni-magdeburg.de> * (c) 2000-2001 Vojtech Pavlik <vojtech@ucw.cz> * (c) 2007-2009 Jiri Kosina * * HID debugging support */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/kfifo.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/hid.h> #include <linux/hid-debug.h> static struct dentry *hid_debug_root; struct hid_usage_entry { unsigned page; unsigned usage; const char *description; }; static const struct hid_usage_entry hid_usage_table[] = { { 0x00, 0, "Undefined" }, { 0x01, 0, "GenericDesktop" }, { 0x01, 0x0001, "Pointer" }, { 0x01, 0x0002, "Mouse" }, { 0x01, 0x0004, "Joystick" }, { 0x01, 0x0005, "Gamepad" }, { 0x01, 0x0006, "Keyboard" }, { 0x01, 0x0007, "Keypad" }, { 0x01, 0x0008, "MultiaxisController" }, { 0x01, 0x0009, "TabletPCSystemControls" }, { 0x01, 0x000a, "WaterCoolingDevice" }, { 0x01, 0x000b, "ComputerChassisDevice" }, { 0x01, 0x000c, "WirelessRadioControls" }, { 0x01, 0x000d, "PortableDeviceControl" }, { 0x01, 0x000e, "SystemMultiAxisController" }, { 0x01, 0x000f, "SpatialController" }, { 0x01, 0x0010, "AssistiveControl" }, { 0x01, 0x0011, "DeviceDock" }, { 0x01, 0x0012, "DockableDevice" }, { 0x01, 0x0013, "CallStateManagementControl" }, { 0x01, 0x0030, "X" }, { 0x01, 0x0031, "Y" }, { 0x01, 0x0032, "Z" }, { 0x01, 0x0033, "Rx" }, { 0x01, 0x0034, "Ry" }, { 0x01, 0x0035, "Rz" }, { 0x01, 0x0036, "Slider" }, { 0x01, 0x0037, "Dial" }, { 0x01, 0x0038, "Wheel" }, { 0x01, 0x0039, "HatSwitch" }, { 0x01, 0x003a, "CountedBuffer" }, { 0x01, 0x003b, "ByteCount" }, { 0x01, 0x003c, "MotionWakeup" }, { 0x01, 0x003d, "Start" }, { 0x01, 0x003e, "Select" }, { 0x01, 0x0040, "Vx" }, { 0x01, 0x0041, "Vy" }, { 0x01, 0x0042, "Vz" }, { 0x01, 0x0043, "Vbrx" }, { 0x01, 0x0044, "Vbry" }, { 0x01, 0x0045, "Vbrz" }, { 0x01, 0x0046, "Vno" }, { 0x01, 0x0047, "FeatureNotification" }, { 0x01, 0x0048, "ResolutionMultiplier" }, { 0x01, 0x0049, "Qx" }, { 0x01, 0x004a, "Qy" }, { 0x01, 0x004b, "Qz" }, { 0x01, 0x004c, "Qw" }, { 0x01, 0x0080, "SystemControl" }, { 0x01, 0x0081, "SystemPowerDown" }, { 0x01, 0x0082, "SystemSleep" }, { 0x01, 0x0083, "SystemWakeUp" }, { 0x01, 0x0084, "SystemContextMenu" }, { 0x01, 0x0085, "SystemMainMenu" }, { 0x01, 0x0086, "SystemAppMenu" }, { 0x01, 0x0087, "SystemMenuHelp" }, { 0x01, 0x0088, "SystemMenuExit" }, { 0x01, 0x0089, "SystemMenuSelect" }, { 0x01, 0x008a, "SystemMenuRight" }, { 0x01, 0x008b, "SystemMenuLeft" }, { 0x01, 0x008c, "SystemMenuUp" }, { 0x01, 0x008d, "SystemMenuDown" }, { 0x01, 0x008e, "SystemColdRestart" }, { 0x01, 0x008f, "SystemWarmRestart" }, { 0x01, 0x0090, "DpadUp" }, { 0x01, 0x0091, "DpadDown" }, { 0x01, 0x0092, "DpadRight" }, { 0x01, 0x0093, "DpadLeft" }, { 0x01, 0x0094, "IndexTrigger" }, { 0x01, 0x0095, "PalmTrigger" }, { 0x01, 0x0096, "Thumbstick" }, { 0x01, 0x0097, "SystemFunctionShift" }, { 0x01, 0x0098, "SystemFunctionShiftLock" }, { 0x01, 0x0099, "SystemFunctionShiftLockIndicator" }, { 0x01, 0x009a, "SystemDismissNotification" }, { 0x01, 0x009b, "SystemDoNotDisturb" }, { 0x01, 0x00a0, "SystemDock" }, { 0x01, 0x00a1, "SystemUndock" }, { 0x01, 0x00a2, "SystemSetup" }, { 0x01, 0x00a3, "SystemBreak" }, { 0x01, 0x00a4, "SystemDebuggerBreak" }, { 0x01, 0x00a5, "ApplicationBreak" }, { 0x01, 0x00a6, "ApplicationDebuggerBreak" }, { 0x01, 0x00a7, "SystemSpeakerMute" }, { 0x01, 0x00a8, "SystemHibernate" }, { 0x01, 0x00a9, "SystemMicrophoneMute" }, { 0x01, 0x00b0, "SystemDisplayInvert" }, { 0x01, 0x00b1, "SystemDisplayInternal" }, { 0x01, 0x00b2, "SystemDisplayExternal" }, { 0x01, 0x00b3, "SystemDisplayBoth" }, { 0x01, 0x00b4, "SystemDisplayDual" }, { 0x01, 0x00b5, "SystemDisplayToggleIntExtMode" }, { 0x01, 0x00b6, "SystemDisplaySwapPrimarySecondary" }, { 0x01, 0x00b7, "SystemDisplayToggleLCDAutoscale" }, { 0x01, 0x00c0, "SensorZone" }, { 0x01, 0x00c1, "RPM" }, { 0x01, 0x00c2, "CoolantLevel" }, { 0x01, 0x00c3, "CoolantCriticalLevel" }, { 0x01, 0x00c4, "CoolantPump" }, { 0x01, 0x00c5, "ChassisEnclosure" }, { 0x01, 0x00c6, "WirelessRadioButton" }, { 0x01, 0x00c7, "WirelessRadioLED" }, { 0x01, 0x00c8, "WirelessRadioSliderSwitch" }, { 0x01, 0x00c9, "SystemDisplayRotationLockButton" }, { 0x01, 0x00ca, "SystemDisplayRotationLockSliderSwitch" }, { 0x01, 0x00cb, "ControlEnable" }, { 0x01, 0x00d0, "DockableDeviceUniqueID" }, { 0x01, 0x00d1, "DockableDeviceVendorID" }, { 0x01, 0x00d2, "DockableDevicePrimaryUsagePage" }, { 0x01, 0x00d3, "DockableDevicePrimaryUsageID" }, { 0x01, 0x00d4, "DockableDeviceDockingState" }, { 0x01, 0x00d5, "DockableDeviceDisplayOcclusion" }, { 0x01, 0x00d6, "DockableDeviceObjectType" }, { 0x01, 0x00e0, "CallActiveLED" }, { 0x01, 0x00e1, "CallMuteToggle" }, { 0x01, 0x00e2, "CallMuteLED" }, { 0x02, 0, "SimulationControls" }, { 0x02, 0x0001, "FlightSimulationDevice" }, { 0x02, 0x0002, "AutomobileSimulationDevice" }, { 0x02, 0x0003, "TankSimulationDevice" }, { 0x02, 0x0004, "SpaceshipSimulationDevice" }, { 0x02, 0x0005, "SubmarineSimulationDevice" }, { 0x02, 0x0006, "SailingSimulationDevice" }, { 0x02, 0x0007, "MotorcycleSimulationDevice" }, { 0x02, 0x0008, "SportsSimulationDevice" }, { 0x02, 0x0009, "AirplaneSimulationDevice" }, { 0x02, 0x000a, "HelicopterSimulationDevice" }, { 0x02, 0x000b, "MagicCarpetSimulationDevice" }, { 0x02, 0x000c, "BicycleSimulationDevice" }, { 0x02, 0x0020, "FlightControlStick" }, { 0x02, 0x0021, "FlightStick" }, { 0x02, 0x0022, "CyclicControl" }, { 0x02, 0x0023, "CyclicTrim" }, { 0x02, 0x0024, "FlightYoke" }, { 0x02, 0x0025, "TrackControl" }, { 0x02, 0x00b0, "Aileron" }, { 0x02, 0x00b1, "AileronTrim" }, { 0x02, 0x00b2, "AntiTorqueControl" }, { 0x02, 0x00b3, "AutopilotEnable" }, { 0x02, 0x00b4, "ChaffRelease" }, { 0x02, 0x00b5, "CollectiveControl" }, { 0x02, 0x00b6, "DiveBrake" }, { 0x02, 0x00b7, "ElectronicCountermeasures" }, { 0x02, 0x00b8, "Elevator" }, { 0x02, 0x00b9, "ElevatorTrim" }, { 0x02, 0x00ba, "Rudder" }, { 0x02, 0x00bb, "Throttle" }, { 0x02, 0x00bc, "FlightCommunications" }, { 0x02, 0x00bd, "FlareRelease" }, { 0x02, 0x00be, "LandingGear" }, { 0x02, 0x00bf, "ToeBrake" }, { 0x02, 0x00c0, "Trigger" }, { 0x02, 0x00c1, "WeaponsArm" }, { 0x02, 0x00c2, "WeaponsSelect" }, { 0x02, 0x00c3, "WingFlaps" }, { 0x02, 0x00c4, "Accelerator" }, { 0x02, 0x00c5, "Brake" }, { 0x02, 0x00c6, "Clutch" }, { 0x02, 0x00c7, "Shifter" }, { 0x02, 0x00c8, "Steering" }, { 0x02, 0x00c9, "TurretDirection" }, { 0x02, 0x00ca, "BarrelElevation" }, { 0x02, 0x00cb, "DivePlane" }, { 0x02, 0x00cc, "Ballast" }, { 0x02, 0x00cd, "BicycleCrank" }, { 0x02, 0x00ce, "HandleBars" }, { 0x02, 0x00cf, "FrontBrake" }, { 0x02, 0x00d0, "RearBrake" }, { 0x03, 0, "VRControls" }, { 0x03, 0x0001, "Belt" }, { 0x03, 0x0002, "BodySuit" }, { 0x03, 0x0003, "Flexor" }, { 0x03, 0x0004, "Glove" }, { 0x03, 0x0005, "HeadTracker" }, { 0x03, 0x0006, "HeadMountedDisplay" }, { 0x03, 0x0007, "HandTracker" }, { 0x03, 0x0008, "Oculometer" }, { 0x03, 0x0009, "Vest" }, { 0x03, 0x000a, "AnimatronicDevice" }, { 0x03, 0x0020, "StereoEnable" }, { 0x03, 0x0021, "DisplayEnable" }, { 0x04, 0, "SportControls" }, { 0x04, 0x0001, "BaseballBat" }, { 0x04, 0x0002, "GolfClub" }, { 0x04, 0x0003, "RowingMachine" }, { 0x04, 0x0004, "Treadmill" }, { 0x04, 0x0030, "Oar" }, { 0x04, 0x0031, "Slope" }, { 0x04, 0x0032, "Rate" }, { 0x04, 0x0033, "StickSpeed" }, { 0x04, 0x0034, "StickFaceAngle" }, { 0x04, 0x0035, "StickHeelToe" }, { 0x04, 0x0036, "StickFollowThrough" }, { 0x04, 0x0037, "StickTempo" }, { 0x04, 0x0038, "StickType" }, { 0x04, 0x0039, "StickHeight" }, { 0x04, 0x0050, "Putter" }, { 0x04, 0x0051, "1Iron" }, { 0x04, 0x0052, "2Iron" }, { 0x04, 0x0053, "3Iron" }, { 0x04, 0x0054, "4Iron" }, { 0x04, 0x0055, "5Iron" }, { 0x04, 0x0056, "6Iron" }, { 0x04, 0x0057, "7Iron" }, { 0x04, 0x0058, "8Iron" }, { 0x04, 0x0059, "9Iron" }, { 0x04, 0x005a, "10Iron" }, { 0x04, 0x005b, "11Iron" }, { 0x04, 0x005c, "SandWedge" }, { 0x04, 0x005d, "LoftWedge" }, { 0x04, 0x005e, "PowerWedge" }, { 0x04, 0x005f, "1Wood" }, { 0x04, 0x0060, "3Wood" }, { 0x04, 0x0061, "5Wood" }, { 0x04, 0x0062, "7Wood" }, { 0x04, 0x0063, "9Wood" }, { 0x05, 0, "GameControls" }, { 0x05, 0x0001, "3DGameController" }, { 0x05, 0x0002, "PinballDevice" }, { 0x05, 0x0003, "GunDevice" }, { 0x05, 0x0020, "PointofView" }, { 0x05, 0x0021, "TurnRightLeft" }, { 0x05, 0x0022, "PitchForwardBackward" }, { 0x05, 0x0023, "RollRightLeft" }, { 0x05, 0x0024, "MoveRightLeft" }, { 0x05, 0x0025, "MoveForwardBackward" }, { 0x05, 0x0026, "MoveUpDown" }, { 0x05, 0x0027, "LeanRightLeft" }, { 0x05, 0x0028, "LeanForwardBackward" }, { 0x05, 0x0029, "HeightofPOV" }, { 0x05, 0x002a, "Flipper" }, { 0x05, 0x002b, "SecondaryFlipper" }, { 0x05, 0x002c, "Bump" }, { 0x05, 0x002d, "NewGame" }, { 0x05, 0x002e, "ShootBall" }, { 0x05, 0x002f, "Player" }, { 0x05, 0x0030, "GunBolt" }, { 0x05, 0x0031, "GunClip" }, { 0x05, 0x0032, "GunSelector" }, { 0x05, 0x0033, "GunSingleShot" }, { 0x05, 0x0034, "GunBurst" }, { 0x05, 0x0035, "GunAutomatic" }, { 0x05, 0x0036, "GunSafety" }, { 0x05, 0x0037, "GamepadFireJump" }, { 0x05, 0x0039, "GamepadTrigger" }, { 0x05, 0x003a, "FormfittingGamepad" }, { 0x06, 0, "GenericDeviceControls" }, { 0x06, 0x0001, "BackgroundNonuserControls" }, { 0x06, 0x0020, "BatteryStrength" }, { 0x06, 0x0021, "WirelessChannel" }, { 0x06, 0x0022, "WirelessID" }, { 0x06, 0x0023, "DiscoverWirelessControl" }, { 0x06, 0x0024, "SecurityCodeCharacterEntered" }, { 0x06, 0x0025, "SecurityCodeCharacterErased" }, { 0x06, 0x0026, "SecurityCodeCleared" }, { 0x06, 0x0027, "SequenceID" }, { 0x06, 0x0028, "SequenceIDReset" }, { 0x06, 0x0029, "RFSignalStrength" }, { 0x06, 0x002a, "SoftwareVersion" }, { 0x06, 0x002b, "ProtocolVersion" }, { 0x06, 0x002c, "HardwareVersion" }, { 0x06, 0x002d, "Major" }, { 0x06, 0x002e, "Minor" }, { 0x06, 0x002f, "Revision" }, { 0x06, 0x0030, "Handedness" }, { 0x06, 0x0031, "EitherHand" }, { 0x06, 0x0032, "LeftHand" }, { 0x06, 0x0033, "RightHand" }, { 0x06, 0x0034, "BothHands" }, { 0x06, 0x0040, "GripPoseOffset" }, { 0x06, 0x0041, "PointerPoseOffset" }, { 0x07, 0, "KeyboardKeypad" }, { 0x07, 0x0001, "ErrorRollOver" }, { 0x07, 0x0002, "POSTFail" }, { 0x07, 0x0003, "ErrorUndefined" }, { 0x07, 0x0004, "KeyboardA" }, { 0x07, 0x0005, "KeyboardB" }, { 0x07, 0x0006, "KeyboardC" }, { 0x07, 0x0007, "KeyboardD" }, { 0x07, 0x0008, "KeyboardE" }, { 0x07, 0x0009, "KeyboardF" }, { 0x07, 0x000a, "KeyboardG" }, { 0x07, 0x000b, "KeyboardH" }, { 0x07, 0x000c, "KeyboardI" }, { 0x07, 0x000d, "KeyboardJ" }, { 0x07, 0x000e, "KeyboardK" }, { 0x07, 0x000f, "KeyboardL" }, { 0x07, 0x0010, "KeyboardM" }, { 0x07, 0x0011, "KeyboardN" }, { 0x07, 0x0012, "KeyboardO" }, { 0x07, 0x0013, "KeyboardP" }, { 0x07, 0x0014, "KeyboardQ" }, { 0x07, 0x0015, "KeyboardR" }, { 0x07, 0x0016, "KeyboardS" }, { 0x07, 0x0017, "KeyboardT" }, { 0x07, 0x0018, "KeyboardU" }, { 0x07, 0x0019, "KeyboardV" }, { 0x07, 0x001a, "KeyboardW" }, { 0x07, 0x001b, "KeyboardX" }, { 0x07, 0x001c, "KeyboardY" }, { 0x07, 0x001d, "KeyboardZ" }, { 0x07, 0x001e, "Keyboard1andBang" }, { 0x07, 0x001f, "Keyboard2andAt" }, { 0x07, 0x0020, "Keyboard3andHash" }, { 0x07, 0x0021, "Keyboard4andDollar" }, { 0x07, 0x0022, "Keyboard5andPercent" }, { 0x07, 0x0023, "Keyboard6andCaret" }, { 0x07, 0x0024, "Keyboard7andAmpersand" }, { 0x07, 0x0025, "Keyboard8andStar" }, { 0x07, 0x0026, "Keyboard9andLeftBracket" }, { 0x07, 0x0027, "Keyboard0andRightBracket" }, { 0x07, 0x0028, "KeyboardReturnEnter" }, { 0x07, 0x0029, "KeyboardEscape" }, { 0x07, 0x002a, "KeyboardDelete" }, { 0x07, 0x002b, "KeyboardTab" }, { 0x07, 0x002c, "KeyboardSpacebar" }, { 0x07, 0x002d, "KeyboardDashandUnderscore" }, { 0x07, 0x002e, "KeyboardEqualsandPlus" }, { 0x07, 0x002f, "KeyboardLeftBrace" }, { 0x07, 0x0030, "KeyboardRightBrace" }, { 0x07, 0x0031, "KeyboardBackslashandPipe" }, { 0x07, 0x0032, "KeyboardNonUSHashandTilde" }, { 0x07, 0x0033, "KeyboardSemiColonandColon" }, { 0x07, 0x0034, "KeyboardLeftAposandDouble" }, { 0x07, 0x0035, "KeyboardGraveAccentandTilde" }, { 0x07, 0x0036, "KeyboardCommaandLessThan" }, { 0x07, 0x0037, "KeyboardPeriodandGreaterThan" }, { 0x07, 0x0038, "KeyboardForwardSlashandQuestionMark" }, { 0x07, 0x0039, "KeyboardCapsLock" }, { 0x07, 0x003a, "KeyboardF1" }, { 0x07, 0x003b, "KeyboardF2" }, { 0x07, 0x003c, "KeyboardF3" }, { 0x07, 0x003d, "KeyboardF4" }, { 0x07, 0x003e, "KeyboardF5" }, { 0x07, 0x003f, "KeyboardF6" }, { 0x07, 0x0040, "KeyboardF7" }, { 0x07, 0x0041, "KeyboardF8" }, { 0x07, 0x0042, "KeyboardF9" }, { 0x07, 0x0043, "KeyboardF10" }, { 0x07, 0x0044, "KeyboardF11" }, { 0x07, 0x0045, "KeyboardF12" }, { 0x07, 0x0046, "KeyboardPrintScreen" }, { 0x07, 0x0047, "KeyboardScrollLock" }, { 0x07, 0x0048, "KeyboardPause" }, { 0x07, 0x0049, "KeyboardInsert" }, { 0x07, 0x004a, "KeyboardHome" }, { 0x07, 0x004b, "KeyboardPageUp" }, { 0x07, 0x004c, "KeyboardDeleteForward" }, { 0x07, 0x004d, "KeyboardEnd" }, { 0x07, 0x004e, "KeyboardPageDown" }, { 0x07, 0x004f, "KeyboardRightArrow" }, { 0x07, 0x0050, "KeyboardLeftArrow" }, { 0x07, 0x0051, "KeyboardDownArrow" }, { 0x07, 0x0052, "KeyboardUpArrow" }, { 0x07, 0x0053, "KeypadNumLockandClear" }, { 0x07, 0x0054, "KeypadForwardSlash" }, { 0x07, 0x0055, "KeypadStar" }, { 0x07, 0x0056, "KeypadDash" }, { 0x07, 0x0057, "KeypadPlus" }, { 0x07, 0x0058, "KeypadENTER" }, { 0x07, 0x0059, "Keypad1andEnd" }, { 0x07, 0x005a, "Keypad2andDownArrow" }, { 0x07, 0x005b, "Keypad3andPageDn" }, { 0x07, 0x005c, "Keypad4andLeftArrow" }, { 0x07, 0x005d, "Keypad5" }, { 0x07, 0x005e, "Keypad6andRightArrow" }, { 0x07, 0x005f, "Keypad7andHome" }, { 0x07, 0x0060, "Keypad8andUpArrow" }, { 0x07, 0x0061, "Keypad9andPageUp" }, { 0x07, 0x0062, "Keypad0andInsert" }, { 0x07, 0x0063, "KeypadPeriodandDelete" }, { 0x07, 0x0064, "KeyboardNonUSBackslashandPipe" }, { 0x07, 0x0065, "KeyboardApplication" }, { 0x07, 0x0066, "KeyboardPower" }, { 0x07, 0x0067, "KeypadEquals" }, { 0x07, 0x0068, "KeyboardF13" }, { 0x07, 0x0069, "KeyboardF14" }, { 0x07, 0x006a, "KeyboardF15" }, { 0x07, 0x006b, "KeyboardF16" }, { 0x07, 0x006c, "KeyboardF17" }, { 0x07, 0x006d, "KeyboardF18" }, { 0x07, 0x006e, "KeyboardF19" }, { 0x07, 0x006f, "KeyboardF20" }, { 0x07, 0x0070, "KeyboardF21" }, { 0x07, 0x0071, "KeyboardF22" }, { 0x07, 0x0072, "KeyboardF23" }, { 0x07, 0x0073, "KeyboardF24" }, { 0x07, 0x0074, "KeyboardExecute" }, { 0x07, 0x0075, "KeyboardHelp" }, { 0x07, 0x0076, "KeyboardMenu" }, { 0x07, 0x0077, "KeyboardSelect" }, { 0x07, 0x0078, "KeyboardStop" }, { 0x07, 0x0079, "KeyboardAgain" }, { 0x07, 0x007a, "KeyboardUndo" }, { 0x07, 0x007b, "KeyboardCut" }, { 0x07, 0x007c, "KeyboardCopy" }, { 0x07, 0x007d, "KeyboardPaste" }, { 0x07, 0x007e, "KeyboardFind" }, { 0x07, 0x007f, "KeyboardMute" }, { 0x07, 0x0080, "KeyboardVolumeUp" }, { 0x07, 0x0081, "KeyboardVolumeDown" }, { 0x07, 0x0082, "KeyboardLockingCapsLock" }, { 0x07, 0x0083, "KeyboardLockingNumLock" }, { 0x07, 0x0084, "KeyboardLockingScrollLock" }, { 0x07, 0x0085, "KeypadComma" }, { 0x07, 0x0086, "KeypadEqualSign" }, { 0x07, 0x0087, "KeyboardInternational1" }, { 0x07, 0x0088, "KeyboardInternational2" }, { 0x07, 0x0089, "KeyboardInternational3" }, { 0x07, 0x008a, "KeyboardInternational4" }, { 0x07, 0x008b, "KeyboardInternational5" }, { 0x07, 0x008c, "KeyboardInternational6" }, { 0x07, 0x008d, "KeyboardInternational7" }, { 0x07, 0x008e, "KeyboardInternational8" }, { 0x07, 0x008f, "KeyboardInternational9" }, { 0x07, 0x0090, "KeyboardLANG1" }, { 0x07, 0x0091, "KeyboardLANG2" }, { 0x07, 0x0092, "KeyboardLANG3" }, { 0x07, 0x0093, "KeyboardLANG4" }, { 0x07, 0x0094, "KeyboardLANG5" }, { 0x07, 0x0095, "KeyboardLANG6" }, { 0x07, 0x0096, "KeyboardLANG7" }, { 0x07, 0x0097, "KeyboardLANG8" }, { 0x07, 0x0098, "KeyboardLANG9" }, { 0x07, 0x0099, "KeyboardAlternateErase" }, { 0x07, 0x009a, "KeyboardSysReqAttention" }, { 0x07, 0x009b, "KeyboardCancel" }, { 0x07, 0x009c, "KeyboardClear" }, { 0x07, 0x009d, "KeyboardPrior" }, { 0x07, 0x009e, "KeyboardReturn" }, { 0x07, 0x009f, "KeyboardSeparator" }, { 0x07, 0x00a0, "KeyboardOut" }, { 0x07, 0x00a1, "KeyboardOper" }, { 0x07, 0x00a2, "KeyboardClearAgain" }, { 0x07, 0x00a3, "KeyboardCrSelProps" }, { 0x07, 0x00a4, "KeyboardExSel" }, { 0x07, 0x00b0, "KeypadDouble0" }, { 0x07, 0x00b1, "KeypadTriple0" }, { 0x07, 0x00b2, "ThousandsSeparator" }, { 0x07, 0x00b3, "DecimalSeparator" }, { 0x07, 0x00b4, "CurrencyUnit" }, { 0x07, 0x00b5, "CurrencySubunit" }, { 0x07, 0x00b6, "KeypadLeftBracket" }, { 0x07, 0x00b7, "KeypadRightBracket" }, { 0x07, 0x00b8, "KeypadLeftBrace" }, { 0x07, 0x00b9, "KeypadRightBrace" }, { 0x07, 0x00ba, "KeypadTab" }, { 0x07, 0x00bb, "KeypadBackspace" }, { 0x07, 0x00bc, "KeypadA" }, { 0x07, 0x00bd, "KeypadB" }, { 0x07, 0x00be, "KeypadC" }, { 0x07, 0x00bf, "KeypadD" }, { 0x07, 0x00c0, "KeypadE" }, { 0x07, 0x00c1, "KeypadF" }, { 0x07, 0x00c2, "KeypadXOR" }, { 0x07, 0x00c3, "KeypadCaret" }, { 0x07, 0x00c4, "KeypadPercentage" }, { 0x07, 0x00c5, "KeypadLess" }, { 0x07, 0x00c6, "KeypadGreater" }, { 0x07, 0x00c7, "KeypadAmpersand" }, { 0x07, 0x00c8, "KeypadDoubleAmpersand" }, { 0x07, 0x00c9, "KeypadBar" }, { 0x07, 0x00ca, "KeypadDoubleBar" }, { 0x07, 0x00cb, "KeypadColon" }, { 0x07, 0x00cc, "KeypadHash" }, { 0x07, 0x00cd, "KeypadSpace" }, { 0x07, 0x00ce, "KeypadAt" }, { 0x07, 0x00cf, "KeypadBang" }, { 0x07, 0x00d0, "KeypadMemoryStore" }, { 0x07, 0x00d1, "KeypadMemoryRecall" }, { 0x07, 0x00d2, "KeypadMemoryClear" }, { 0x07, 0x00d3, "KeypadMemoryAdd" }, { 0x07, 0x00d4, "KeypadMemorySubtract" }, { 0x07, 0x00d5, "KeypadMemoryMultiply" }, { 0x07, 0x00d6, "KeypadMemoryDivide" }, { 0x07, 0x00d7, "KeypadPlusMinus" }, { 0x07, 0x00d8, "KeypadClear" }, { 0x07, 0x00d9, "KeypadClearEntry" }, { 0x07, 0x00da, "KeypadBinary" }, { 0x07, 0x00db, "KeypadOctal" }, { 0x07, 0x00dc, "KeypadDecimal" }, { 0x07, 0x00dd, "KeypadHexadecimal" }, { 0x07, 0x00e0, "KeyboardLeftControl" }, { 0x07, 0x00e1, "KeyboardLeftShift" }, { 0x07, 0x00e2, "KeyboardLeftAlt" }, { 0x07, 0x00e3, "KeyboardLeftGUI" }, { 0x07, 0x00e4, "KeyboardRightControl" }, { 0x07, 0x00e5, "KeyboardRightShift" }, { 0x07, 0x00e6, "KeyboardRightAlt" }, { 0x07, 0x00e7, "KeyboardRightGUI" }, { 0x08, 0, "LED" }, { 0x08, 0x0001, "NumLock" }, { 0x08, 0x0002, "CapsLock" }, { 0x08, 0x0003, "ScrollLock" }, { 0x08, 0x0004, "Compose" }, { 0x08, 0x0005, "Kana" }, { 0x08, 0x0006, "Power" }, { 0x08, 0x0007, "Shift" }, { 0x08, 0x0008, "DoNotDisturb" }, { 0x08, 0x0009, "Mute" }, { 0x08, 0x000a, "ToneEnable" }, { 0x08, 0x000b, "HighCutFilter" }, { 0x08, 0x000c, "LowCutFilter" }, { 0x08, 0x000d, "EqualizerEnable" }, { 0x08, 0x000e, "SoundFieldOn" }, { 0x08, 0x000f, "SurroundOn" }, { 0x08, 0x0010, "Repeat" }, { 0x08, 0x0011, "Stereo" }, { 0x08, 0x0012, "SamplingRateDetect" }, { 0x08, 0x0013, "Spinning" }, { 0x08, 0x0014, "CAV" }, { 0x08, 0x0015, "CLV" }, { 0x08, 0x0016, "RecordingFormatDetect" }, { 0x08, 0x0017, "OffHook" }, { 0x08, 0x0018, "Ring" }, { 0x08, 0x0019, "MessageWaiting" }, { 0x08, 0x001a, "DataMode" }, { 0x08, 0x001b, "BatteryOperation" }, { 0x08, 0x001c, "BatteryOK" }, { 0x08, 0x001d, "BatteryLow" }, { 0x08, 0x001e, "Speaker" }, { 0x08, 0x001f, "Headset" }, { 0x08, 0x0020, "Hold" }, { 0x08, 0x0021, "Microphone" }, { 0x08, 0x0022, "Coverage" }, { 0x08, 0x0023, "NightMode" }, { 0x08, 0x0024, "SendCalls" }, { 0x08, 0x0025, "CallPickup" }, { 0x08, 0x0026, "Conference" }, { 0x08, 0x0027, "Standby" }, { 0x08, 0x0028, "CameraOn" }, { 0x08, 0x0029, "CameraOff" }, { 0x08, 0x002a, "OnLine" }, { 0x08, 0x002b, "OffLine" }, { 0x08, 0x002c, "Busy" }, { 0x08, 0x002d, "Ready" }, { 0x08, 0x002e, "PaperOut" }, { 0x08, 0x002f, "PaperJam" }, { 0x08, 0x0030, "Remote" }, { 0x08, 0x0031, "Forward" }, { 0x08, 0x0032, "Reverse" }, { 0x08, 0x0033, "Stop" }, { 0x08, 0x0034, "Rewind" }, { 0x08, 0x0035, "FastForward" }, { 0x08, 0x0036, "Play" }, { 0x08, 0x0037, "Pause" }, { 0x08, 0x0038, "Record" }, { 0x08, 0x0039, "Error" }, { 0x08, 0x003a, "UsageSelectedIndicator" }, { 0x08, 0x003b, "UsageInUseIndicator" }, { 0x08, 0x003c, "UsageMultiModeIndicator" }, { 0x08, 0x003d, "IndicatorOn" }, { 0x08, 0x003e, "IndicatorFlash" }, { 0x08, 0x003f, "IndicatorSlowBlink" }, { 0x08, 0x0040, "IndicatorFastBlink" }, { 0x08, 0x0041, "IndicatorOff" }, { 0x08, 0x0042, "FlashOnTime" }, { 0x08, 0x0043, "SlowBlinkOnTime" }, { 0x08, 0x0044, "SlowBlinkOffTime" }, { 0x08, 0x0045, "FastBlinkOnTime" }, { 0x08, 0x0046, "FastBlinkOffTime" }, { 0x08, 0x0047, "UsageIndicatorColor" }, { 0x08, 0x0048, "IndicatorRed" }, { 0x08, 0x0049, "IndicatorGreen" }, { 0x08, 0x004a, "IndicatorAmber" }, { 0x08, 0x004b, "GenericIndicator" }, { 0x08, 0x004c, "SystemSuspend" }, { 0x08, 0x004d, "ExternalPowerConnected" }, { 0x08, 0x004e, "IndicatorBlue" }, { 0x08, 0x004f, "IndicatorOrange" }, { 0x08, 0x0050, "GoodStatus" }, { 0x08, 0x0051, "WarningStatus" }, { 0x08, 0x0052, "RGBLED" }, { 0x08, 0x0053, "RedLEDChannel" }, { 0x08, 0x0054, "BlueLEDChannel" }, { 0x08, 0x0055, "GreenLEDChannel" }, { 0x08, 0x0056, "LEDIntensity" }, { 0x08, 0x0057, "SystemMicrophoneMute" }, { 0x08, 0x0060, "PlayerIndicator" }, { 0x08, 0x0061, "Player1" }, { 0x08, 0x0062, "Player2" }, { 0x08, 0x0063, "Player3" }, { 0x08, 0x0064, "Player4" }, { 0x08, 0x0065, "Player5" }, { 0x08, 0x0066, "Player6" }, { 0x08, 0x0067, "Player7" }, { 0x08, 0x0068, "Player8" }, { 0x09, 0, "Button" }, { 0x0a, 0, "Ordinal" }, { 0x0b, 0, "TelephonyDevice" }, { 0x0b, 0x0001, "Phone" }, { 0x0b, 0x0002, "AnsweringMachine" }, { 0x0b, 0x0003, "MessageControls" }, { 0x0b, 0x0004, "Handset" }, { 0x0b, 0x0005, "Headset" }, { 0x0b, 0x0006, "TelephonyKeyPad" }, { 0x0b, 0x0007, "ProgrammableButton" }, { 0x0b, 0x0020, "HookSwitch" }, { 0x0b, 0x0021, "Flash" }, { 0x0b, 0x0022, "Feature" }, { 0x0b, 0x0023, "Hold" }, { 0x0b, 0x0024, "Redial" }, { 0x0b, 0x0025, "Transfer" }, { 0x0b, 0x0026, "Drop" }, { 0x0b, 0x0027, "Park" }, { 0x0b, 0x0028, "ForwardCalls" }, { 0x0b, 0x0029, "AlternateFunction" }, { 0x0b, 0x002a, "Line" }, { 0x0b, 0x002b, "SpeakerPhone" }, { 0x0b, 0x002c, "Conference" }, { 0x0b, 0x002d, "RingEnable" }, { 0x0b, 0x002e, "RingSelect" }, { 0x0b, 0x002f, "PhoneMute" }, { 0x0b, 0x0030, "CallerID" }, { 0x0b, 0x0031, "Send" }, { 0x0b, 0x0050, "SpeedDial" }, { 0x0b, 0x0051, "StoreNumber" }, { 0x0b, 0x0052, "RecallNumber" }, { 0x0b, 0x0053, "PhoneDirectory" }, { 0x0b, 0x0070, "VoiceMail" }, { 0x0b, 0x0071, "ScreenCalls" }, { 0x0b, 0x0072, "DoNotDisturb" }, { 0x0b, 0x0073, "Message" }, { 0x0b, 0x0074, "AnswerOnOff" }, { 0x0b, 0x0090, "InsideDialTone" }, { 0x0b, 0x0091, "OutsideDialTone" }, { 0x0b, 0x0092, "InsideRingTone" }, { 0x0b, 0x0093, "OutsideRingTone" }, { 0x0b, 0x0094, "PriorityRingTone" }, { 0x0b, 0x0095, "InsideRingback" }, { 0x0b, 0x0096, "PriorityRingback" }, { 0x0b, 0x0097, "LineBusyTone" }, { 0x0b, 0x0098, "ReorderTone" }, { 0x0b, 0x0099, "CallWaitingTone" }, { 0x0b, 0x009a, "ConfirmationTone1" }, { 0x0b, 0x009b, "ConfirmationTone2" }, { 0x0b, 0x009c, "TonesOff" }, { 0x0b, 0x009d, "OutsideRingback" }, { 0x0b, 0x009e, "Ringer" }, { 0x0b, 0x00b0, "PhoneKey0" }, { 0x0b, 0x00b1, "PhoneKey1" }, { 0x0b, 0x00b2, "PhoneKey2" }, { 0x0b, 0x00b3, "PhoneKey3" }, { 0x0b, 0x00b4, "PhoneKey4" }, { 0x0b, 0x00b5, "PhoneKey5" }, { 0x0b, 0x00b6, "PhoneKey6" }, { 0x0b, 0x00b7, "PhoneKey7" }, { 0x0b, 0x00b8, "PhoneKey8" }, { 0x0b, 0x00b9, "PhoneKey9" }, { 0x0b, 0x00ba, "PhoneKeyStar" }, { 0x0b, 0x00bb, "PhoneKeyPound" }, { 0x0b, 0x00bc, "PhoneKeyA" }, { 0x0b, 0x00bd, "PhoneKeyB" }, { 0x0b, 0x00be, "PhoneKeyC" }, { 0x0b, 0x00bf, "PhoneKeyD" }, { 0x0b, 0x00c0, "PhoneCallHistoryKey" }, { 0x0b, 0x00c1, "PhoneCallerIDKey" }, { 0x0b, 0x00c2, "PhoneSettingsKey" }, { 0x0b, 0x00f0, "HostControl" }, { 0x0b, 0x00f1, "HostAvailable" }, { 0x0b, 0x00f2, "HostCallActive" }, { 0x0b, 0x00f3, "ActivateHandsetAudio" }, { 0x0b, 0x00f4, "RingType" }, { 0x0b, 0x00f5, "RedialablePhoneNumber" }, { 0x0b, 0x00f8, "StopRingTone" }, { 0x0b, 0x00f9, "PSTNRingTone" }, { 0x0b, 0x00fa, "HostRingTone" }, { 0x0b, 0x00fb, "AlertSoundError" }, { 0x0b, 0x00fc, "AlertSoundConfirm" }, { 0x0b, 0x00fd, "AlertSoundNotification" }, { 0x0b, 0x00fe, "SilentRing" }, { 0x0b, 0x0108, "EmailMessageWaiting" }, { 0x0b, 0x0109, "VoicemailMessageWaiting" }, { 0x0b, 0x010a, "HostHold" }, { 0x0b, 0x0110, "IncomingCallHistoryCount" }, { 0x0b, 0x0111, "OutgoingCallHistoryCount" }, { 0x0b, 0x0112, "IncomingCallHistory" }, { 0x0b, 0x0113, "OutgoingCallHistory" }, { 0x0b, 0x0114, "PhoneLocale" }, { 0x0b, 0x0140, "PhoneTimeSecond" }, { 0x0b, 0x0141, "PhoneTimeMinute" }, { 0x0b, 0x0142, "PhoneTimeHour" }, { 0x0b, 0x0143, "PhoneDateDay" }, { 0x0b, 0x0144, "PhoneDateMonth" }, { 0x0b, 0x0145, "PhoneDateYear" }, { 0x0b, 0x0146, "HandsetNickname" }, { 0x0b, 0x0147, "AddressBookID" }, { 0x0b, 0x014a, "CallDuration" }, { 0x0b, 0x014b, "DualModePhone" }, { 0x0c, 0, "Consumer" }, { 0x0c, 0x0001, "ConsumerControl" }, { 0x0c, 0x0002, "NumericKeyPad" }, { 0x0c, 0x0003, "ProgrammableButtons" }, { 0x0c, 0x0004, "Microphone" }, { 0x0c, 0x0005, "Headphone" }, { 0x0c, 0x0006, "GraphicEqualizer" }, { 0x0c, 0x0020, "10" }, { 0x0c, 0x0021, "100" }, { 0x0c, 0x0022, "AMPM" }, { 0x0c, 0x0030, "Power" }, { 0x0c, 0x0031, "Reset" }, { 0x0c, 0x0032, "Sleep" }, { 0x0c, 0x0033, "SleepAfter" }, { 0x0c, 0x0034, "SleepMode" }, { 0x0c, 0x0035, "Illumination" }, { 0x0c, 0x0036, "FunctionButtons" }, { 0x0c, 0x0040, "Menu" }, { 0x0c, 0x0041, "MenuPick" }, { 0x0c, 0x0042, "MenuUp" }, { 0x0c, 0x0043, "MenuDown" }, { 0x0c, 0x0044, "MenuLeft" }, { 0x0c, 0x0045, "MenuRight" }, { 0x0c, 0x0046, "MenuEscape" }, { 0x0c, 0x0047, "MenuValueIncrease" }, { 0x0c, 0x0048, "MenuValueDecrease" }, { 0x0c, 0x0060, "DataOnScreen" }, { 0x0c, 0x0061, "ClosedCaption" }, { 0x0c, 0x0062, "ClosedCaptionSelect" }, { 0x0c, 0x0063, "VCRTV" }, { 0x0c, 0x0064, "BroadcastMode" }, { 0x0c, 0x0065, "Snapshot" }, { 0x0c, 0x0066, "Still" }, { 0x0c, 0x0067, "PictureinPictureToggle" }, { 0x0c, 0x0068, "PictureinPictureSwap" }, { 0x0c, 0x0069, "RedMenuButton" }, { 0x0c, 0x006a, "GreenMenuButton" }, { 0x0c, 0x006b, "BlueMenuButton" }, { 0x0c, 0x006c, "YellowMenuButton" }, { 0x0c, 0x006d, "Aspect" }, { 0x0c, 0x006e, "3DModeSelect" }, { 0x0c, 0x006f, "DisplayBrightnessIncrement" }, { 0x0c, 0x0070, "DisplayBrightnessDecrement" }, { 0x0c, 0x0071, "DisplayBrightness" }, { 0x0c, 0x0072, "DisplayBacklightToggle" }, { 0x0c, 0x0073, "DisplaySetBrightnesstoMinimum" }, { 0x0c, 0x0074, "DisplaySetBrightnesstoMaximum" }, { 0x0c, 0x0075, "DisplaySetAutoBrightness" }, { 0x0c, 0x0076, "CameraAccessEnabled" }, { 0x0c, 0x0077, "CameraAccessDisabled" }, { 0x0c, 0x0078, "CameraAccessToggle" }, { 0x0c, 0x0079, "KeyboardBrightnessIncrement" }, { 0x0c, 0x007a, "KeyboardBrightnessDecrement" }, { 0x0c, 0x007b, "KeyboardBacklightSetLevel" }, { 0x0c, 0x007c, "KeyboardBacklightOOC" }, { 0x0c, 0x007d, "KeyboardBacklightSetMinimum" }, { 0x0c, 0x007e, "KeyboardBacklightSetMaximum" }, { 0x0c, 0x007f, "KeyboardBacklightAuto" }, { 0x0c, 0x0080, "Selection" }, { 0x0c, 0x0081, "AssignSelection" }, { 0x0c, 0x0082, "ModeStep" }, { 0x0c, 0x0083, "RecallLast" }, { 0x0c, 0x0084, "EnterChannel" }, { 0x0c, 0x0085, "OrderMovie" }, { 0x0c, 0x0086, "Channel" }, { 0x0c, 0x0087, "MediaSelection" }, { 0x0c, 0x0088, "MediaSelectComputer" }, { 0x0c, 0x0089, "MediaSelectTV" }, { 0x0c, 0x008a, "MediaSelectWWW" }, { 0x0c, 0x008b, "MediaSelectDVD" }, { 0x0c, 0x008c, "MediaSelectTelephone" }, { 0x0c, 0x008d, "MediaSelectProgramGuide" }, { 0x0c, 0x008e, "MediaSelectVideoPhone" }, { 0x0c, 0x008f, "MediaSelectGames" }, { 0x0c, 0x0090, "MediaSelectMessages" }, { 0x0c, 0x0091, "MediaSelectCD" }, { 0x0c, 0x0092, "MediaSelectVCR" }, { 0x0c, 0x0093, "MediaSelectTuner" }, { 0x0c, 0x0094, "Quit" }, { 0x0c, 0x0095, "Help" }, { 0x0c, 0x0096, "MediaSelectTape" }, { 0x0c, 0x0097, "MediaSelectCable" }, { 0x0c, 0x0098, "MediaSelectSatellite" }, { 0x0c, 0x0099, "MediaSelectSecurity" }, { 0x0c, 0x009a, "MediaSelectHome" }, { 0x0c, 0x009b, "MediaSelectCall" }, { 0x0c, 0x009c, "ChannelIncrement" }, { 0x0c, 0x009d, "ChannelDecrement" }, { 0x0c, 0x009e, "MediaSelectSAP" }, { 0x0c, 0x00a0, "VCRPlus" }, { 0x0c, 0x00a1, "Once" }, { 0x0c, 0x00a2, "Daily" }, { 0x0c, 0x00a3, "Weekly" }, { 0x0c, 0x00a4, "Monthly" }, { 0x0c, 0x00b0, "Play" }, { 0x0c, 0x00b1, "Pause" }, { 0x0c, 0x00b2, "Record" }, { 0x0c, 0x00b3, "FastForward" }, { 0x0c, 0x00b4, "Rewind" }, { 0x0c, 0x00b5, "ScanNextTrack" }, { 0x0c, 0x00b6, "ScanPreviousTrack" }, { 0x0c, 0x00b7, "Stop" }, { 0x0c, 0x00b8, "Eject" }, { 0x0c, 0x00b9, "RandomPlay" }, { 0x0c, 0x00ba, "SelectDisc" }, { 0x0c, 0x00bb, "EnterDisc" }, { 0x0c, 0x00bc, "Repeat" }, { 0x0c, 0x00bd, "Tracking" }, { 0x0c, 0x00be, "TrackNormal" }, { 0x0c, 0x00bf, "SlowTracking" }, { 0x0c, 0x00c0, "FrameForward" }, { 0x0c, 0x00c1, "FrameBack" }, { 0x0c, 0x00c2, "Mark" }, { 0x0c, 0x00c3, "ClearMark" }, { 0x0c, 0x00c4, "RepeatFromMark" }, { 0x0c, 0x00c5, "ReturnToMark" }, { 0x0c, 0x00c6, "SearchMarkForward" }, { 0x0c, 0x00c7, "SearchMarkBackwards" }, { 0x0c, 0x00c8, "CounterReset" }, { 0x0c, 0x00c9, "ShowCounter" }, { 0x0c, 0x00ca, "TrackingIncrement" }, { 0x0c, 0x00cb, "TrackingDecrement" }, { 0x0c, 0x00cc, "StopEject" }, { 0x0c, 0x00cd, "PlayPause" }, { 0x0c, 0x00ce, "PlaySkip" }, { 0x0c, 0x00cf, "VoiceCommand" }, { 0x0c, 0x00d0, "InvokeCaptureInterface" }, { 0x0c, 0x00d1, "StartorStopGameRecording" }, { 0x0c, 0x00d2, "HistoricalGameCapture" }, { 0x0c, 0x00d3, "CaptureGameScreenshot" }, { 0x0c, 0x00d4, "ShoworHideRecordingIndicator" }, { 0x0c, 0x00d5, "StartorStopMicrophoneCapture" }, { 0x0c, 0x00d6, "StartorStopCameraCapture" }, { 0x0c, 0x00d7, "StartorStopGameBroadcast" }, { 0x0c, 0x00d8, "StartorStopVoiceDictationSession" }, { 0x0c, 0x00d9, "InvokeDismissEmojiPicker" }, { 0x0c, 0x00e0, "Volume" }, { 0x0c, 0x00e1, "Balance" }, { 0x0c, 0x00e2, "Mute" }, { 0x0c, 0x00e3, "Bass" }, { 0x0c, 0x00e4, "Treble" }, { 0x0c, 0x00e5, "BassBoost" }, { 0x0c, 0x00e6, "SurroundMode" }, { 0x0c, 0x00e7, "Loudness" }, { 0x0c, 0x00e8, "MPX" }, { 0x0c, 0x00e9, "VolumeIncrement" }, { 0x0c, 0x00ea, "VolumeDecrement" }, { 0x0c, 0x00f0, "SpeedSelect" }, { 0x0c, 0x00f1, "PlaybackSpeed" }, { 0x0c, 0x00f2, "StandardPlay" }, { 0x0c, 0x00f3, "LongPlay" }, { 0x0c, 0x00f4, "ExtendedPlay" }, { 0x0c, 0x00f5, "Slow" }, { 0x0c, 0x0100, "FanEnable" }, { 0x0c, 0x0101, "FanSpeed" }, { 0x0c, 0x0102, "LightEnable" }, { 0x0c, 0x0103, "LightIlluminationLevel" }, { 0x0c, 0x0104, "ClimateControlEnable" }, { 0x0c, 0x0105, "RoomTemperature" }, { 0x0c, 0x0106, "SecurityEnable" }, { 0x0c, 0x0107, "FireAlarm" }, { 0x0c, 0x0108, "PoliceAlarm" }, { 0x0c, 0x0109, "Proximity" }, { 0x0c, 0x010a, "Motion" }, { 0x0c, 0x010b, "DuressAlarm" }, { 0x0c, 0x010c, "HoldupAlarm" }, { 0x0c, 0x010d, "MedicalAlarm" }, { 0x0c, 0x0150, "BalanceRight" }, { 0x0c, 0x0151, "BalanceLeft" }, { 0x0c, 0x0152, "BassIncrement" }, { 0x0c, 0x0153, "BassDecrement" }, { 0x0c, 0x0154, "TrebleIncrement" }, { 0x0c, 0x0155, "TrebleDecrement" }, { 0x0c, 0x0160, "SpeakerSystem" }, { 0x0c, 0x0161, "ChannelLeft" }, { 0x0c, 0x0162, "ChannelRight" }, { 0x0c, 0x0163, "ChannelCenter" }, { 0x0c, 0x0164, "ChannelFront" }, { 0x0c, 0x0165, "ChannelCenterFront" }, { 0x0c, 0x0166, "ChannelSide" }, { 0x0c, 0x0167, "ChannelSurround" }, { 0x0c, 0x0168, "ChannelLowFrequencyEnhancement" }, { 0x0c, 0x0169, "ChannelTop" }, { 0x0c, 0x016a, "ChannelUnknown" }, { 0x0c, 0x0170, "Subchannel" }, { 0x0c, 0x0171, "SubchannelIncrement" }, { 0x0c, 0x0172, "SubchannelDecrement" }, { 0x0c, 0x0173, "AlternateAudioIncrement" }, { 0x0c, 0x0174, "AlternateAudioDecrement" }, { 0x0c, 0x0180, "ApplicationLaunchButtons" }, { 0x0c, 0x0181, "ALLaunchButtonConfigurationTool" }, { 0x0c, 0x0182, "ALProgrammableButtonConfiguration" }, { 0x0c, 0x0183, "ALConsumerControlConfiguration" }, { 0x0c, 0x0184, "ALWordProcessor" }, { 0x0c, 0x0185, "ALTextEditor" }, { 0x0c, 0x0186, "ALSpreadsheet" }, { 0x0c, 0x0187, "ALGraphicsEditor" }, { 0x0c, 0x0188, "ALPresentationApp" }, { 0x0c, 0x0189, "ALDatabaseApp" }, { 0x0c, 0x018a, "ALEmailReader" }, { 0x0c, 0x018b, "ALNewsreader" }, { 0x0c, 0x018c, "ALVoicemail" }, { 0x0c, 0x018d, "ALContactsAddressBook" }, { 0x0c, 0x018e, "ALCalendarSchedule" }, { 0x0c, 0x018f, "ALTaskProjectManager" }, { 0x0c, 0x0190, "ALLogJournalTimecard" }, { 0x0c, 0x0191, "ALCheckbookFinance" }, { 0x0c, 0x0192, "ALCalculator" }, { 0x0c, 0x0193, "ALAVCapturePlayback" }, { 0x0c, 0x0194, "ALLocalMachineBrowser" }, { 0x0c, 0x0195, "ALLANWANBrowser" }, { 0x0c, 0x0196, "ALInternetBrowser" }, { 0x0c, 0x0197, "ALRemoteNetworkingISPConnect" }, { 0x0c, 0x0198, "ALNetworkConference" }, { 0x0c, 0x0199, "ALNetworkChat" }, { 0x0c, 0x019a, "ALTelephonyDialer" }, { 0x0c, 0x019b, "ALLogon" }, { 0x0c, 0x019c, "ALLogoff" }, { 0x0c, 0x019d, "ALLogonLogoff" }, { 0x0c, 0x019e, "ALTerminalLockScreensaver" }, { 0x0c, 0x019f, "ALControlPanel" }, { 0x0c, 0x01a0, "ALCommandLineProcessorRun" }, { 0x0c, 0x01a1, "ALProcessTaskManager" }, { 0x0c, 0x01a2, "ALSelectTaskApplication" }, { 0x0c, 0x01a3, "ALNextTaskApplication" }, { 0x0c, 0x01a4, "ALPreviousTaskApplication" }, { 0x0c, 0x01a5, "ALPreemptiveHaltTaskApplication" }, { 0x0c, 0x01a6, "ALIntegratedHelpCenter" }, { 0x0c, 0x01a7, "ALDocuments" }, { 0x0c, 0x01a8, "ALThesaurus" }, { 0x0c, 0x01a9, "ALDictionary" }, { 0x0c, 0x01aa, "ALDesktop" }, { 0x0c, 0x01ab, "ALSpellCheck" }, { 0x0c, 0x01ac, "ALGrammarCheck" }, { 0x0c, 0x01ad, "ALWirelessStatus" }, { 0x0c, 0x01ae, "ALKeyboardLayout" }, { 0x0c, 0x01af, "ALVirusProtection" }, { 0x0c, 0x01b0, "ALEncryption" }, { 0x0c, 0x01b1, "ALScreenSaver" }, { 0x0c, 0x01b2, "ALAlarms" }, { 0x0c, 0x01b3, "ALClock" }, { 0x0c, 0x01b4, "ALFileBrowser" }, { 0x0c, 0x01b5, "ALPowerStatus" }, { 0x0c, 0x01b6, "ALImageBrowser" }, { 0x0c, 0x01b7, "ALAudioBrowser" }, { 0x0c, 0x01b8, "ALMovieBrowser" }, { 0x0c, 0x01b9, "ALDigitalRightsManager" }, { 0x0c, 0x01ba, "ALDigitalWallet" }, { 0x0c, 0x01bc, "ALInstantMessaging" }, { 0x0c, 0x01bd, "ALOEMFeaturesTipsTutorialBrowser" }, { 0x0c, 0x01be, "ALOEMHelp" }, { 0x0c, 0x01bf, "ALOnlineCommunity" }, { 0x0c, 0x01c0, "ALEntertainmentContentBrowser" }, { 0x0c, 0x01c1, "ALOnlineShoppingBrowser" }, { 0x0c, 0x01c2, "ALSmartCardInformationHelp" }, { 0x0c, 0x01c3, "ALMarketMonitorFinanceBrowser" }, { 0x0c, 0x01c4, "ALCustomizedCorporateNewsBrowser" }, { 0x0c, 0x01c5, "ALOnlineActivityBrowser" }, { 0x0c, 0x01c6, "ALResearchSearchBrowser" }, { 0x0c, 0x01c7, "ALAudioPlayer" }, { 0x0c, 0x01c8, "ALMessageStatus" }, { 0x0c, 0x01c9, "ALContactSync" }, { 0x0c, 0x01ca, "ALNavigation" }, { 0x0c, 0x01cb, "ALContextawareDesktopAssistant" }, { 0x0c, 0x0200, "GenericGUIApplicationControls" }, { 0x0c, 0x0201, "ACNew" }, { 0x0c, 0x0202, "ACOpen" }, { 0x0c, 0x0203, "ACClose" }, { 0x0c, 0x0204, "ACExit" }, { 0x0c, 0x0205, "ACMaximize" }, { 0x0c, 0x0206, "ACMinimize" }, { 0x0c, 0x0207, "ACSave" }, { 0x0c, 0x0208, "ACPrint" }, { 0x0c, 0x0209, "ACProperties" }, { 0x0c, 0x021a, "ACUndo" }, { 0x0c, 0x021b, "ACCopy" }, { 0x0c, 0x021c, "ACCut" }, { 0x0c, 0x021d, "ACPaste" }, { 0x0c, 0x021e, "ACSelectAll" }, { 0x0c, 0x021f, "ACFind" }, { 0x0c, 0x0220, "ACFindandReplace" }, { 0x0c, 0x0221, "ACSearch" }, { 0x0c, 0x0222, "ACGoTo" }, { 0x0c, 0x0223, "ACHome" }, { 0x0c, 0x0224, "ACBack" }, { 0x0c, 0x0225, "ACForward" }, { 0x0c, 0x0226, "ACStop" }, { 0x0c, 0x0227, "ACRefresh" }, { 0x0c, 0x0228, "ACPreviousLink" }, { 0x0c, 0x0229, "ACNextLink" }, { 0x0c, 0x022a, "ACBookmarks" }, { 0x0c, 0x022b, "ACHistory" }, { 0x0c, 0x022c, "ACSubscriptions" }, { 0x0c, 0x022d, "ACZoomIn" }, { 0x0c, 0x022e, "ACZoomOut" }, { 0x0c, 0x022f, "ACZoom" }, { 0x0c, 0x0230, "ACFullScreenView" }, { 0x0c, 0x0231, "ACNormalView" }, { 0x0c, 0x0232, "ACViewToggle" }, { 0x0c, 0x0233, "ACScrollUp" }, { 0x0c, 0x0234, "ACScrollDown" }, { 0x0c, 0x0235, "ACScroll" }, { 0x0c, 0x0236, "ACPanLeft" }, { 0x0c, 0x0237, "ACPanRight" }, { 0x0c, 0x0238, "ACPan" }, { 0x0c, 0x0239, "ACNewWindow" }, { 0x0c, 0x023a, "ACTileHorizontally" }, { 0x0c, 0x023b, "ACTileVertically" }, { 0x0c, 0x023c, "ACFormat" }, { 0x0c, 0x023d, "ACEdit" }, { 0x0c, 0x023e, "ACBold" }, { 0x0c, 0x023f, "ACItalics" }, { 0x0c, 0x0240, "ACUnderline" }, { 0x0c, 0x0241, "ACStrikethrough" }, { 0x0c, 0x0242, "ACSubscript" }, { 0x0c, 0x0243, "ACSuperscript" }, { 0x0c, 0x0244, "ACAllCaps" }, { 0x0c, 0x0245, "ACRotate" }, { 0x0c, 0x0246, "ACResize" }, { 0x0c, 0x0247, "ACFlipHorizontal" }, { 0x0c, 0x0248, "ACFlipVertical" }, { 0x0c, 0x0249, "ACMirrorHorizontal" }, { 0x0c, 0x024a, "ACMirrorVertical" }, { 0x0c, 0x024b, "ACFontSelect" }, { 0x0c, 0x024c, "ACFontColor" }, { 0x0c, 0x024d, "ACFontSize" }, { 0x0c, 0x024e, "ACJustifyLeft" }, { 0x0c, 0x024f, "ACJustifyCenterH" }, { 0x0c, 0x0250, "ACJustifyRight" }, { 0x0c, 0x0251, "ACJustifyBlockH" }, { 0x0c, 0x0252, "ACJustifyTop" }, { 0x0c, 0x0253, "ACJustifyCenterV" }, { 0x0c, 0x0254, "ACJustifyBottom" }, { 0x0c, 0x0255, "ACJustifyBlockV" }, { 0x0c, 0x0256, "ACIndentDecrease" }, { 0x0c, 0x0257, "ACIndentIncrease" }, { 0x0c, 0x0258, "ACNumberedList" }, { 0x0c, 0x0259, "ACRestartNumbering" }, { 0x0c, 0x025a, "ACBulletedList" }, { 0x0c, 0x025b, "ACPromote" }, { 0x0c, 0x025c, "ACDemote" }, { 0x0c, 0x025d, "ACYes" }, { 0x0c, 0x025e, "ACNo" }, { 0x0c, 0x025f, "ACCancel" }, { 0x0c, 0x0260, "ACCatalog" }, { 0x0c, 0x0261, "ACBuyCheckout" }, { 0x0c, 0x0262, "ACAddtoCart" }, { 0x0c, 0x0263, "ACExpand" }, { 0x0c, 0x0264, "ACExpandAll" }, { 0x0c, 0x0265, "ACCollapse" }, { 0x0c, 0x0266, "ACCollapseAll" }, { 0x0c, 0x0267, "ACPrintPreview" }, { 0x0c, 0x0268, "ACPasteSpecial" }, { 0x0c, 0x0269, "ACInsertMode" }, { 0x0c, 0x026a, "ACDelete" }, { 0x0c, 0x026b, "ACLock" }, { 0x0c, 0x026c, "ACUnlock" }, { 0x0c, 0x026d, "ACProtect" }, { 0x0c, 0x026e, "ACUnprotect" }, { 0x0c, 0x026f, "ACAttachComment" }, { 0x0c, 0x0270, "ACDeleteComment" }, { 0x0c, 0x0271, "ACViewComment" }, { 0x0c, 0x0272, "ACSelectWord" }, { 0x0c, 0x0273, "ACSelectSentence" }, { 0x0c, 0x0274, "ACSelectParagraph" }, { 0x0c, 0x0275, "ACSelectColumn" }, { 0x0c, 0x0276, "ACSelectRow" }, { 0x0c, 0x0277, "ACSelectTable" }, { 0x0c, 0x0278, "ACSelectObject" }, { 0x0c, 0x0279, "ACRedoRepeat" }, { 0x0c, 0x027a, "ACSort" }, { 0x0c, 0x027b, "ACSortAscending" }, { 0x0c, 0x027c, "ACSortDescending" }, { 0x0c, 0x027d, "ACFilter" }, { 0x0c, 0x027e, "ACSetClock" }, { 0x0c, 0x027f, "ACViewClock" }, { 0x0c, 0x0280, "ACSelectTimeZone" }, { 0x0c, 0x0281, "ACEditTimeZones" }, { 0x0c, 0x0282, "ACSetAlarm" }, { 0x0c, 0x0283, "ACClearAlarm" }, { 0x0c, 0x0284, "ACSnoozeAlarm" }, { 0x0c, 0x0285, "ACResetAlarm" }, { 0x0c, 0x0286, "ACSynchronize" }, { 0x0c, 0x0287, "ACSendReceive" }, { 0x0c, 0x0288, "ACSendTo" }, { 0x0c, 0x0289, "ACReply" }, { 0x0c, 0x028a, "ACReplyAll" }, { 0x0c, 0x028b, "ACForwardMsg" }, { 0x0c, 0x028c, "ACSend" }, { 0x0c, 0x028d, "ACAttachFile" }, { 0x0c, 0x028e, "ACUpload" }, { 0x0c, 0x028f, "ACDownloadSaveTargetAs" }, { 0x0c, 0x0290, "ACSetBorders" }, { 0x0c, 0x0291, "ACInsertRow" }, { 0x0c, 0x0292, "ACInsertColumn" }, { 0x0c, 0x0293, "ACInsertFile" }, { 0x0c, 0x0294, "ACInsertPicture" }, { 0x0c, 0x0295, "ACInsertObject" }, { 0x0c, 0x0296, "ACInsertSymbol" }, { 0x0c, 0x0297, "ACSaveandClose" }, { 0x0c, 0x0298, "ACRename" }, { 0x0c, 0x0299, "ACMerge" }, { 0x0c, 0x029a, "ACSplit" }, { 0x0c, 0x029b, "ACDisributeHorizontally" }, { 0x0c, 0x029c, "ACDistributeVertically" }, { 0x0c, 0x029d, "ACNextKeyboardLayoutSelect" }, { 0x0c, 0x029e, "ACNavigationGuidance" }, { 0x0c, 0x029f, "ACDesktopShowAllWindows" }, { 0x0c, 0x02a0, "ACSoftKeyLeft" }, { 0x0c, 0x02a1, "ACSoftKeyRight" }, { 0x0c, 0x02a2, "ACDesktopShowAllApplications" }, { 0x0c, 0x02b0, "ACIdleKeepAlive" }, { 0x0c, 0x02c0, "ExtendedKeyboardAttributesCollection" }, { 0x0c, 0x02c1, "KeyboardFormFactor" }, { 0x0c, 0x02c2, "KeyboardKeyType" }, { 0x0c, 0x02c3, "KeyboardPhysicalLayout" }, { 0x0c, 0x02c4, "VendorSpecificKeyboardPhysicalLayout" }, { 0x0c, 0x02c5, "KeyboardIETFLanguageTagIndex" }, { 0x0c, 0x02c6, "ImplementedKeyboardInputAssistControls" }, { 0x0c, 0x02c7, "KeyboardInputAssistPrevious" }, { 0x0c, 0x02c8, "KeyboardInputAssistNext" }, { 0x0c, 0x02c9, "KeyboardInputAssistPreviousGroup" }, { 0x0c, 0x02ca, "KeyboardInputAssistNextGroup" }, { 0x0c, 0x02cb, "KeyboardInputAssistAccept" }, { 0x0c, 0x02cc, "KeyboardInputAssistCancel" }, { 0x0c, 0x02d0, "PrivacyScreenToggle" }, { 0x0c, 0x02d1, "PrivacyScreenLevelDecrement" }, { 0x0c, 0x02d2, "PrivacyScreenLevelIncrement" }, { 0x0c, 0x02d3, "PrivacyScreenLevelMinimum" }, { 0x0c, 0x02d4, "PrivacyScreenLevelMaximum" }, { 0x0c, 0x0500, "ContactEdited" }, { 0x0c, 0x0501, "ContactAdded" }, { 0x0c, 0x0502, "ContactRecordActive" }, { 0x0c, 0x0503, "ContactIndex" }, { 0x0c, 0x0504, "ContactNickname" }, { 0x0c, 0x0505, "ContactFirstName" }, { 0x0c, 0x0506, "ContactLastName" }, { 0x0c, 0x0507, "ContactFullName" }, { 0x0c, 0x0508, "ContactPhoneNumberPersonal" }, { 0x0c, 0x0509, "ContactPhoneNumberBusiness" }, { 0x0c, 0x050a, "ContactPhoneNumberMobile" }, { 0x0c, 0x050b, "ContactPhoneNumberPager" }, { 0x0c, 0x050c, "ContactPhoneNumberFax" }, { 0x0c, 0x050d, "ContactPhoneNumberOther" }, { 0x0c, 0x050e, "ContactEmailPersonal" }, { 0x0c, 0x050f, "ContactEmailBusiness" }, { 0x0c, 0x0510, "ContactEmailOther" }, { 0x0c, 0x0511, "ContactEmailMain" }, { 0x0c, 0x0512, "ContactSpeedDialNumber" }, { 0x0c, 0x0513, "ContactStatusFlag" }, { 0x0c, 0x0514, "ContactMisc" }, { 0x0d, 0, "Digitizers" }, { 0x0d, 0x0001, "Digitizer" }, { 0x0d, 0x0002, "Pen" }, { 0x0d, 0x0003, "LightPen" }, { 0x0d, 0x0004, "TouchScreen" }, { 0x0d, 0x0005, "TouchPad" }, { 0x0d, 0x0006, "Whiteboard" }, { 0x0d, 0x0007, "CoordinateMeasuringMachine" }, { 0x0d, 0x0008, "3DDigitizer" }, { 0x0d, 0x0009, "StereoPlotter" }, { 0x0d, 0x000a, "ArticulatedArm" }, { 0x0d, 0x000b, "Armature" }, { 0x0d, 0x000c, "MultiplePointDigitizer" }, { 0x0d, 0x000d, "FreeSpaceWand" }, { 0x0d, 0x000e, "DeviceConfiguration" }, { 0x0d, 0x000f, "CapacitiveHeatMapDigitizer" }, { 0x0d, 0x0020, "Stylus" }, { 0x0d, 0x0021, "Puck" }, { 0x0d, 0x0022, "Finger" }, { 0x0d, 0x0023, "Devicesettings" }, { 0x0d, 0x0024, "CharacterGesture" }, { 0x0d, 0x0030, "TipPressure" }, { 0x0d, 0x0031, "BarrelPressure" }, { 0x0d, 0x0032, "InRange" }, { 0x0d, 0x0033, "Touch" }, { 0x0d, 0x0034, "Untouch" }, { 0x0d, 0x0035, "Tap" }, { 0x0d, 0x0036, "Quality" }, { 0x0d, 0x0037, "DataValid" }, { 0x0d, 0x0038, "TransducerIndex" }, { 0x0d, 0x0039, "TabletFunctionKeys" }, { 0x0d, 0x003a, "ProgramChangeKeys" }, { 0x0d, 0x003b, "BatteryStrength" }, { 0x0d, 0x003c, "Invert" }, { 0x0d, 0x003d, "XTilt" }, { 0x0d, 0x003e, "YTilt" }, { 0x0d, 0x003f, "Azimuth" }, { 0x0d, 0x0040, "Altitude" }, { 0x0d, 0x0041, "Twist" }, { 0x0d, 0x0042, "TipSwitch" }, { 0x0d, 0x0043, "SecondaryTipSwitch" }, { 0x0d, 0x0044, "BarrelSwitch" }, { 0x0d, 0x0045, "Eraser" }, { 0x0d, 0x0046, "TabletPick" }, { 0x0d, 0x0047, "TouchValid" }, { 0x0d, 0x0048, "Width" }, { 0x0d, 0x0049, "Height" }, { 0x0d, 0x0051, "ContactIdentifier" }, { 0x0d, 0x0052, "DeviceMode" }, { 0x0d, 0x0053, "DeviceIdentifier" }, { 0x0d, 0x0054, "ContactCount" }, { 0x0d, 0x0055, "ContactCountMaximum" }, { 0x0d, 0x0056, "ScanTime" }, { 0x0d, 0x0057, "SurfaceSwitch" }, { 0x0d, 0x0058, "ButtonSwitch" }, { 0x0d, 0x0059, "PadType" }, { 0x0d, 0x005a, "SecondaryBarrelSwitch" }, { 0x0d, 0x005b, "TransducerSerialNumber" }, { 0x0d, 0x005c, "PreferredColor" }, { 0x0d, 0x005d, "PreferredColorisLocked" }, { 0x0d, 0x005e, "PreferredLineWidth" }, { 0x0d, 0x005f, "PreferredLineWidthisLocked" }, { 0x0d, 0x0060, "LatencyMode" }, { 0x0d, 0x0061, "GestureCharacterQuality" }, { 0x0d, 0x0062, "CharacterGestureDataLength" }, { 0x0d, 0x0063, "CharacterGestureData" }, { 0x0d, 0x0064, "GestureCharacterEncoding" }, { 0x0d, 0x0065, "UTF8CharacterGestureEncoding" }, { 0x0d, 0x0066, "UTF16LittleEndianCharacterGestureEncoding" }, { 0x0d, 0x0067, "UTF16BigEndianCharacterGestureEncoding" }, { 0x0d, 0x0068, "UTF32LittleEndianCharacterGestureEncoding" }, { 0x0d, 0x0069, "UTF32BigEndianCharacterGestureEncoding" }, { 0x0d, 0x006a, "CapacitiveHeatMapProtocolVendorID" }, { 0x0d, 0x006b, "CapacitiveHeatMapProtocolVersion" }, { 0x0d, 0x006c, "CapacitiveHeatMapFrameData" }, { 0x0d, 0x006d, "GestureCharacterEnable" }, { 0x0d, 0x006e, "TransducerSerialNumberPart2" }, { 0x0d, 0x006f, "NoPreferredColor" }, { 0x0d, 0x0070, "PreferredLineStyle" }, { 0x0d, 0x0071, "PreferredLineStyleisLocked" }, { 0x0d, 0x0072, "Ink" }, { 0x0d, 0x0073, "Pencil" }, { 0x0d, 0x0074, "Highlighter" }, { 0x0d, 0x0075, "ChiselMarker" }, { 0x0d, 0x0076, "Brush" }, { 0x0d, 0x0077, "NoPreference" }, { 0x0d, 0x0080, "DigitizerDiagnostic" }, { 0x0d, 0x0081, "DigitizerError" }, { 0x0d, 0x0082, "ErrNormalStatus" }, { 0x0d, 0x0083, "ErrTransducersExceeded" }, { 0x0d, 0x0084, "ErrFullTransFeaturesUnavailable" }, { 0x0d, 0x0085, "ErrChargeLow" }, { 0x0d, 0x0090, "TransducerSoftwareInfo" }, { 0x0d, 0x0091, "TransducerVendorId" }, { 0x0d, 0x0092, "TransducerProductId" }, { 0x0d, 0x0093, "DeviceSupportedProtocols" }, { 0x0d, 0x0094, "TransducerSupportedProtocols" }, { 0x0d, 0x0095, "NoProtocol" }, { 0x0d, 0x0096, "WacomAESProtocol" }, { 0x0d, 0x0097, "USIProtocol" }, { 0x0d, 0x0098, "MicrosoftPenProtocol" }, { 0x0d, 0x00a0, "SupportedReportRates" }, { 0x0d, 0x00a1, "ReportRate" }, { 0x0d, 0x00a2, "TransducerConnected" }, { 0x0d, 0x00a3, "SwitchDisabled" }, { 0x0d, 0x00a4, "SwitchUnimplemented" }, { 0x0d, 0x00a5, "TransducerSwitches" }, { 0x0d, 0x00a6, "TransducerIndexSelector" }, { 0x0d, 0x00b0, "ButtonPressThreshold" }, { 0x0e, 0, "Haptics" }, { 0x0e, 0x0001, "SimpleHapticController" }, { 0x0e, 0x0010, "WaveformList" }, { 0x0e, 0x0011, "DurationList" }, { 0x0e, 0x0020, "AutoTrigger" }, { 0x0e, 0x0021, "ManualTrigger" }, { 0x0e, 0x0022, "AutoTriggerAssociatedControl" }, { 0x0e, 0x0023, "Intensity" }, { 0x0e, 0x0024, "RepeatCount" }, { 0x0e, 0x0025, "RetriggerPeriod" }, { 0x0e, 0x0026, "WaveformVendorPage" }, { 0x0e, 0x0027, "WaveformVendorID" }, { 0x0e, 0x0028, "WaveformCutoffTime" }, { 0x0e, 0x1001, "WaveformNone" }, { 0x0e, 0x1002, "WaveformStop" }, { 0x0e, 0x1003, "WaveformClick" }, { 0x0e, 0x1004, "WaveformBuzzContinuous" }, { 0x0e, 0x1005, "WaveformRumbleContinuous" }, { 0x0e, 0x1006, "WaveformPress" }, { 0x0e, 0x1007, "WaveformRelease" }, { 0x0e, 0x1008, "WaveformHover" }, { 0x0e, 0x1009, "WaveformSuccess" }, { 0x0e, 0x100a, "WaveformError" }, { 0x0e, 0x100b, "WaveformInkContinuous" }, { 0x0e, 0x100c, "WaveformPencilContinuous" }, { 0x0e, 0x100d, "WaveformMarkerContinuous" }, { 0x0e, 0x100e, "WaveformChiselMarkerContinuous" }, { 0x0e, 0x100f, "WaveformBrushContinuous" }, { 0x0e, 0x1010, "WaveformEraserContinuous" }, { 0x0e, 0x1011, "WaveformSparkleContinuous" }, { 0x0f, 0, "PhysicalInputDevice" }, { 0x0f, 0x0001, "PhysicalInputDevice" }, { 0x0f, 0x0020, "Normal" }, { 0x0f, 0x0021, "SetEffectReport" }, { 0x0f, 0x0022, "EffectParameterBlockIndex" }, { 0x0f, 0x0023, "ParameterBlockOffset" }, { 0x0f, 0x0024, "ROMFlag" }, { 0x0f, 0x0025, "EffectType" }, { 0x0f, 0x0026, "ETConstantForce" }, { 0x0f, 0x0027, "ETRamp" }, { 0x0f, 0x0028, "ETCustomForce" }, { 0x0f, 0x0030, "ETSquare" }, { 0x0f, 0x0031, "ETSine" }, { 0x0f, 0x0032, "ETTriangle" }, { 0x0f, 0x0033, "ETSawtoothUp" }, { 0x0f, 0x0034, "ETSawtoothDown" }, { 0x0f, 0x0040, "ETSpring" }, { 0x0f, 0x0041, "ETDamper" }, { 0x0f, 0x0042, "ETInertia" }, { 0x0f, 0x0043, "ETFriction" }, { 0x0f, 0x0050, "Duration" }, { 0x0f, 0x0051, "SamplePeriod" }, { 0x0f, 0x0052, "Gain" }, { 0x0f, 0x0053, "TriggerButton" }, { 0x0f, 0x0054, "TriggerRepeatInterval" }, { 0x0f, 0x0055, "AxesEnable" }, { 0x0f, 0x0056, "DirectionEnable" }, { 0x0f, 0x0057, "Direction" }, { 0x0f, 0x0058, "TypeSpecificBlockOffset" }, { 0x0f, 0x0059, "BlockType" }, { 0x0f, 0x005a, "SetEnvelopeReport" }, { 0x0f, 0x005b, "AttackLevel" }, { 0x0f, 0x005c, "AttackTime" }, { 0x0f, 0x005d, "FadeLevel" }, { 0x0f, 0x005e, "FadeTime" }, { 0x0f, 0x005f, "SetConditionReport" }, { 0x0f, 0x0060, "CenterPointOffset" }, { 0x0f, 0x0061, "PositiveCoefficient" }, { 0x0f, 0x0062, "NegativeCoefficient" }, { 0x0f, 0x0063, "PositiveSaturation" }, { 0x0f, 0x0064, "NegativeSaturation" }, { 0x0f, 0x0065, "DeadBand" }, { 0x0f, 0x0066, "DownloadForceSample" }, { 0x0f, 0x0067, "IsochCustomForceEnable" }, { 0x0f, 0x0068, "CustomForceDataReport" }, { 0x0f, 0x0069, "CustomForceData" }, { 0x0f, 0x006a, "CustomForceVendorDefinedData" }, { 0x0f, 0x006b, "SetCustomForceReport" }, { 0x0f, 0x006c, "CustomForceDataOffset" }, { 0x0f, 0x006d, "SampleCount" }, { 0x0f, 0x006e, "SetPeriodicReport" }, { 0x0f, 0x006f, "Offset" }, { 0x0f, 0x0070, "Magnitude" }, { 0x0f, 0x0071, "Phase" }, { 0x0f, 0x0072, "Period" }, { 0x0f, 0x0073, "SetConstantForceReport" }, { 0x0f, 0x0074, "SetRampForceReport" }, { 0x0f, 0x0075, "RampStart" }, { 0x0f, 0x0076, "RampEnd" }, { 0x0f, 0x0077, "EffectOperationReport" }, { 0x0f, 0x0078, "EffectOperation" }, { 0x0f, 0x0079, "OpEffectStart" }, { 0x0f, 0x007a, "OpEffectStartSolo" }, { 0x0f, 0x007b, "OpEffectStop" }, { 0x0f, 0x007c, "LoopCount" }, { 0x0f, 0x007d, "DeviceGainReport" }, { 0x0f, 0x007e, "DeviceGain" }, { 0x0f, 0x007f, "ParameterBlockPoolsReport" }, { 0x0f, 0x0080, "RAMPoolSize" }, { 0x0f, 0x0081, "ROMPoolSize" }, { 0x0f, 0x0082, "ROMEffectBlockCount" }, { 0x0f, 0x0083, "SimultaneousEffectsMax" }, { 0x0f, 0x0084, "PoolAlignment" }, { 0x0f, 0x0085, "ParameterBlockMoveReport" }, { 0x0f, 0x0086, "MoveSource" }, { 0x0f, 0x0087, "MoveDestination" }, { 0x0f, 0x0088, "MoveLength" }, { 0x0f, 0x0089, "EffectParameterBlockLoadReport" }, { 0x0f, 0x008b, "EffectParameterBlockLoadStatus" }, { 0x0f, 0x008c, "BlockLoadSuccess" }, { 0x0f, 0x008d, "BlockLoadFull" }, { 0x0f, 0x008e, "BlockLoadError" }, { 0x0f, 0x008f, "BlockHandle" }, { 0x0f, 0x0090, "EffectParameterBlockFreeReport" }, { 0x0f, 0x0091, "TypeSpecificBlockHandle" }, { 0x0f, 0x0092, "PIDStateReport" }, { 0x0f, 0x0094, "EffectPlaying" }, { 0x0f, 0x0095, "PIDDeviceControlReport" }, { 0x0f, 0x0096, "PIDDeviceControl" }, { 0x0f, 0x0097, "DCEnableActuators" }, { 0x0f, 0x0098, "DCDisableActuators" }, { 0x0f, 0x0099, "DCStopAllEffects" }, { 0x0f, 0x009a, "DCReset" }, { 0x0f, 0x009b, "DCPause" }, { 0x0f, 0x009c, "DCContinue" }, { 0x0f, 0x009f, "DevicePaused" }, { 0x0f, 0x00a0, "ActuatorsEnabled" }, { 0x0f, 0x00a4, "SafetySwitch" }, { 0x0f, 0x00a5, "ActuatorOverrideSwitch" }, { 0x0f, 0x00a6, "ActuatorPower" }, { 0x0f, 0x00a7, "StartDelay" }, { 0x0f, 0x00a8, "ParameterBlockSize" }, { 0x0f, 0x00a9, "DeviceManagedPool" }, { 0x0f, 0x00aa, "SharedParameterBlocks" }, { 0x0f, 0x00ab, "CreateNewEffectParameterBlockReport" }, { 0x0f, 0x00ac, "RAMPoolAvailable" }, { 0x11, 0, "SoC" }, { 0x11, 0x0001, "SocControl" }, { 0x11, 0x0002, "FirmwareTransfer" }, { 0x11, 0x0003, "FirmwareFileId" }, { 0x11, 0x0004, "FileOffsetInBytes" }, { 0x11, 0x0005, "FileTransferSizeMaxInBytes" }, { 0x11, 0x0006, "FilePayload" }, { 0x11, 0x0007, "FilePayloadSizeInBytes" }, { 0x11, 0x0008, "FilePayloadContainsLastBytes" }, { 0x11, 0x0009, "FileTransferStop" }, { 0x11, 0x000a, "FileTransferTillEnd" }, { 0x12, 0, "EyeandHeadTrackers" }, { 0x12, 0x0001, "EyeTracker" }, { 0x12, 0x0002, "HeadTracker" }, { 0x12, 0x0010, "TrackingData" }, { 0x12, 0x0011, "Capabilities" }, { 0x12, 0x0012, "Configuration" }, { 0x12, 0x0013, "Status" }, { 0x12, 0x0014, "Control" }, { 0x12, 0x0020, "SensorTimestamp" }, { 0x12, 0x0021, "PositionX" }, { 0x12, 0x0022, "PositionY" }, { 0x12, 0x0023, "PositionZ" }, { 0x12, 0x0024, "GazePoint" }, { 0x12, 0x0025, "LeftEyePosition" }, { 0x12, 0x0026, "RightEyePosition" }, { 0x12, 0x0027, "HeadPosition" }, { 0x12, 0x0028, "HeadDirectionPoint" }, { 0x12, 0x0029, "RotationaboutXaxis" }, { 0x12, 0x002a, "RotationaboutYaxis" }, { 0x12, 0x002b, "RotationaboutZaxis" }, { 0x12, 0x0100, "TrackerQuality" }, { 0x12, 0x0101, "MinimumTrackingDistance" }, { 0x12, 0x0102, "OptimumTrackingDistance" }, { 0x12, 0x0103, "MaximumTrackingDistance" }, { 0x12, 0x0104, "MaximumScreenPlaneWidth" }, { 0x12, 0x0105, "MaximumScreenPlaneHeight" }, { 0x12, 0x0200, "DisplayManufacturerID" }, { 0x12, 0x0201, "DisplayProductID" }, { 0x12, 0x0202, "DisplaySerialNumber" }, { 0x12, 0x0203, "DisplayManufacturerDate" }, { 0x12, 0x0204, "CalibratedScreenWidth" }, { 0x12, 0x0205, "CalibratedScreenHeight" }, { 0x12, 0x0300, "SamplingFrequency" }, { 0x12, 0x0301, "ConfigurationStatus" }, { 0x12, 0x0400, "DeviceModeRequest" }, { 0x14, 0, "AuxiliaryDisplay" }, { 0x14, 0x0001, "AlphanumericDisplay" }, { 0x14, 0x0002, "AuxiliaryDisplay" }, { 0x14, 0x0020, "DisplayAttributesReport" }, { 0x14, 0x0021, "ASCIICharacterSet" }, { 0x14, 0x0022, "DataReadBack" }, { 0x14, 0x0023, "FontReadBack" }, { 0x14, 0x0024, "DisplayControlReport" }, { 0x14, 0x0025, "ClearDisplay" }, { 0x14, 0x0026, "DisplayEnable" }, { 0x14, 0x0027, "ScreenSaverDelay" }, { 0x14, 0x0028, "ScreenSaverEnable" }, { 0x14, 0x0029, "VerticalScroll" }, { 0x14, 0x002a, "HorizontalScroll" }, { 0x14, 0x002b, "CharacterReport" }, { 0x14, 0x002c, "DisplayData" }, { 0x14, 0x002d, "DisplayStatus" }, { 0x14, 0x002e, "StatNotReady" }, { 0x14, 0x002f, "StatReady" }, { 0x14, 0x0030, "ErrNotaloadablecharacter" }, { 0x14, 0x0031, "ErrFontdatacannotberead" }, { 0x14, 0x0032, "CursorPositionReport" }, { 0x14, 0x0033, "Row" }, { 0x14, 0x0034, "Column" }, { 0x14, 0x0035, "Rows" }, { 0x14, 0x0036, "Columns" }, { 0x14, 0x0037, "CursorPixelPositioning" }, { 0x14, 0x0038, "CursorMode" }, { 0x14, 0x0039, "CursorEnable" }, { 0x14, 0x003a, "CursorBlink" }, { 0x14, 0x003b, "FontReport" }, { 0x14, 0x003c, "FontData" }, { 0x14, 0x003d, "CharacterWidth" }, { 0x14, 0x003e, "CharacterHeight" }, { 0x14, 0x003f, "CharacterSpacingHorizontal" }, { 0x14, 0x0040, "CharacterSpacingVertical" }, { 0x14, 0x0041, "UnicodeCharacterSet" }, { 0x14, 0x0042, "Font7Segment" }, { 0x14, 0x0043, "7SegmentDirectMap" }, { 0x14, 0x0044, "Font14Segment" }, { 0x14, 0x0045, "14SegmentDirectMap" }, { 0x14, 0x0046, "DisplayBrightness" }, { 0x14, 0x0047, "DisplayContrast" }, { 0x14, 0x0048, "CharacterAttribute" }, { 0x14, 0x0049, "AttributeReadback" }, { 0x14, 0x004a, "AttributeData" }, { 0x14, 0x004b, "CharAttrEnhance" }, { 0x14, 0x004c, "CharAttrUnderline" }, { 0x14, 0x004d, "CharAttrBlink" }, { 0x14, 0x0080, "BitmapSizeX" }, { 0x14, 0x0081, "BitmapSizeY" }, { 0x14, 0x0082, "MaxBlitSize" }, { 0x14, 0x0083, "BitDepthFormat" }, { 0x14, 0x0084, "DisplayOrientation" }, { 0x14, 0x0085, "PaletteReport" }, { 0x14, 0x0086, "PaletteDataSize" }, { 0x14, 0x0087, "PaletteDataOffset" }, { 0x14, 0x0088, "PaletteData" }, { 0x14, 0x008a, "BlitReport" }, { 0x14, 0x008b, "BlitRectangleX1" }, { 0x14, 0x008c, "BlitRectangleY1" }, { 0x14, 0x008d, "BlitRectangleX2" }, { 0x14, 0x008e, "BlitRectangleY2" }, { 0x14, 0x008f, "BlitData" }, { 0x14, 0x0090, "SoftButton" }, { 0x14, 0x0091, "SoftButtonID" }, { 0x14, 0x0092, "SoftButtonSide" }, { 0x14, 0x0093, "SoftButtonOffset1" }, { 0x14, 0x0094, "SoftButtonOffset2" }, { 0x14, 0x0095, "SoftButtonReport" }, { 0x14, 0x00c2, "SoftKeys" }, { 0x14, 0x00cc, "DisplayDataExtensions" }, { 0x14, 0x00cf, "CharacterMapping" }, { 0x14, 0x00dd, "UnicodeEquivalent" }, { 0x14, 0x00df, "CharacterPageMapping" }, { 0x14, 0x00ff, "RequestReport" }, { 0x20, 0, "Sensors" }, { 0x20, 0x0001, "Sensor" }, { 0x20, 0x0010, "Biometric" }, { 0x20, 0x0011, "BiometricHumanPresence" }, { 0x20, 0x0012, "BiometricHumanProximity" }, { 0x20, 0x0013, "BiometricHumanTouch" }, { 0x20, 0x0014, "BiometricBloodPressure" }, { 0x20, 0x0015, "BiometricBodyTemperature" }, { 0x20, 0x0016, "BiometricHeartRate" }, { 0x20, 0x0017, "BiometricHeartRateVariability" }, { 0x20, 0x0018, "BiometricPeripheralOxygenSaturation" }, { 0x20, 0x0019, "BiometricRespiratoryRate" }, { 0x20, 0x0020, "Electrical" }, { 0x20, 0x0021, "ElectricalCapacitance" }, { 0x20, 0x0022, "ElectricalCurrent" }, { 0x20, 0x0023, "ElectricalPower" }, { 0x20, 0x0024, "ElectricalInductance" }, { 0x20, 0x0025, "ElectricalResistance" }, { 0x20, 0x0026, "ElectricalVoltage" }, { 0x20, 0x0027, "ElectricalPotentiometer" }, { 0x20, 0x0028, "ElectricalFrequency" }, { 0x20, 0x0029, "ElectricalPeriod" }, { 0x20, 0x0030, "Environmental" }, { 0x20, 0x0031, "EnvironmentalAtmosphericPressure" }, { 0x20, 0x0032, "EnvironmentalHumidity" }, { 0x20, 0x0033, "EnvironmentalTemperature" }, { 0x20, 0x0034, "EnvironmentalWindDirection" }, { 0x20, 0x0035, "EnvironmentalWindSpeed" }, { 0x20, 0x0036, "EnvironmentalAirQuality" }, { 0x20, 0x0037, "EnvironmentalHeatIndex" }, { 0x20, 0x0038, "EnvironmentalSurfaceTemperature" }, { 0x20, 0x0039, "EnvironmentalVolatileOrganicCompounds" }, { 0x20, 0x003a, "EnvironmentalObjectPresence" }, { 0x20, 0x003b, "EnvironmentalObjectProximity" }, { 0x20, 0x0040, "Light" }, { 0x20, 0x0041, "LightAmbientLight" }, { 0x20, 0x0042, "LightConsumerInfrared" }, { 0x20, 0x0043, "LightInfraredLight" }, { 0x20, 0x0044, "LightVisibleLight" }, { 0x20, 0x0045, "LightUltravioletLight" }, { 0x20, 0x0050, "Location" }, { 0x20, 0x0051, "LocationBroadcast" }, { 0x20, 0x0052, "LocationDeadReckoning" }, { 0x20, 0x0053, "LocationGPSGlobalPositioningSystem" }, { 0x20, 0x0054, "LocationLookup" }, { 0x20, 0x0055, "LocationOther" }, { 0x20, 0x0056, "LocationStatic" }, { 0x20, 0x0057, "LocationTriangulation" }, { 0x20, 0x0060, "Mechanical" }, { 0x20, 0x0061, "MechanicalBooleanSwitch" }, { 0x20, 0x0062, "MechanicalBooleanSwitchArray" }, { 0x20, 0x0063, "MechanicalMultivalueSwitch" }, { 0x20, 0x0064, "MechanicalForce" }, { 0x20, 0x0065, "MechanicalPressure" }, { 0x20, 0x0066, "MechanicalStrain" }, { 0x20, 0x0067, "MechanicalWeight" }, { 0x20, 0x0068, "MechanicalHapticVibrator" }, { 0x20, 0x0069, "MechanicalHallEffectSwitch" }, { 0x20, 0x0070, "Motion" }, { 0x20, 0x0071, "MotionAccelerometer1D" }, { 0x20, 0x0072, "MotionAccelerometer2D" }, { 0x20, 0x0073, "MotionAccelerometer3D" }, { 0x20, 0x0074, "MotionGyrometer1D" }, { 0x20, 0x0075, "MotionGyrometer2D" }, { 0x20, 0x0076, "MotionGyrometer3D" }, { 0x20, 0x0077, "MotionMotionDetector" }, { 0x20, 0x0078, "MotionSpeedometer" }, { 0x20, 0x0079, "MotionAccelerometer" }, { 0x20, 0x007a, "MotionGyrometer" }, { 0x20, 0x007b, "MotionGravityVector" }, { 0x20, 0x007c, "MotionLinearAccelerometer" }, { 0x20, 0x0080, "Orientation" }, { 0x20, 0x0081, "OrientationCompass1D" }, { 0x20, 0x0082, "OrientationCompass2D" }, { 0x20, 0x0083, "OrientationCompass3D" }, { 0x20, 0x0084, "OrientationInclinometer1D" }, { 0x20, 0x0085, "OrientationInclinometer2D" }, { 0x20, 0x0086, "OrientationInclinometer3D" }, { 0x20, 0x0087, "OrientationDistance1D" }, { 0x20, 0x0088, "OrientationDistance2D" }, { 0x20, 0x0089, "OrientationDistance3D" }, { 0x20, 0x008a, "OrientationDeviceOrientation" }, { 0x20, 0x008b, "OrientationCompass" }, { 0x20, 0x008c, "OrientationInclinometer" }, { 0x20, 0x008d, "OrientationDistance" }, { 0x20, 0x008e, "OrientationRelativeOrientation" }, { 0x20, 0x008f, "OrientationSimpleOrientation" }, { 0x20, 0x0090, "Scanner" }, { 0x20, 0x0091, "ScannerBarcode" }, { 0x20, 0x0092, "ScannerRFID" }, { 0x20, 0x0093, "ScannerNFC" }, { 0x20, 0x00a0, "Time" }, { 0x20, 0x00a1, "TimeAlarmTimer" }, { 0x20, 0x00a2, "TimeRealTimeClock" }, { 0x20, 0x00b0, "PersonalActivity" }, { 0x20, 0x00b1, "PersonalActivityActivityDetection" }, { 0x20, 0x00b2, "PersonalActivityDevicePosition" }, { 0x20, 0x00b3, "PersonalActivityFloorTracker" }, { 0x20, 0x00b4, "PersonalActivityPedometer" }, { 0x20, 0x00b5, "PersonalActivityStepDetection" }, { 0x20, 0x00c0, "OrientationExtended" }, { 0x20, 0x00c1, "OrientationExtendedGeomagneticOrientation" }, { 0x20, 0x00c2, "OrientationExtendedMagnetometer" }, { 0x20, 0x00d0, "Gesture" }, { 0x20, 0x00d1, "GestureChassisFlipGesture" }, { 0x20, 0x00d2, "GestureHingeFoldGesture" }, { 0x20, 0x00e0, "Other" }, { 0x20, 0x00e1, "OtherCustom" }, { 0x20, 0x00e2, "OtherGeneric" }, { 0x20, 0x00e3, "OtherGenericEnumerator" }, { 0x20, 0x00e4, "OtherHingeAngle" }, { 0x20, 0x00f0, "VendorReserved1" }, { 0x20, 0x00f1, "VendorReserved2" }, { 0x20, 0x00f2, "VendorReserved3" }, { 0x20, 0x00f3, "VendorReserved4" }, { 0x20, 0x00f4, "VendorReserved5" }, { 0x20, 0x00f5, "VendorReserved6" }, { 0x20, 0x00f6, "VendorReserved7" }, { 0x20, 0x00f7, "VendorReserved8" }, { 0x20, 0x00f8, "VendorReserved9" }, { 0x20, 0x00f9, "VendorReserved10" }, { 0x20, 0x00fa, "VendorReserved11" }, { 0x20, 0x00fb, "VendorReserved12" }, { 0x20, 0x00fc, "VendorReserved13" }, { 0x20, 0x00fd, "VendorReserved14" }, { 0x20, 0x00fe, "VendorReserved15" }, { 0x20, 0x00ff, "VendorReserved16" }, { 0x20, 0x0200, "Event" }, { 0x20, 0x0201, "EventSensorState" }, { 0x20, 0x0202, "EventSensorEvent" }, { 0x20, 0x0300, "Property" }, { 0x20, 0x0301, "PropertyFriendlyName" }, { 0x20, 0x0302, "PropertyPersistentUniqueID" }, { 0x20, 0x0303, "PropertySensorStatus" }, { 0x20, 0x0304, "PropertyMinimumReportInterval" }, { 0x20, 0x0305, "PropertySensorManufacturer" }, { 0x20, 0x0306, "PropertySensorModel" }, { 0x20, 0x0307, "PropertySensorSerialNumber" }, { 0x20, 0x0308, "PropertySensorDescription" }, { 0x20, 0x0309, "PropertySensorConnectionType" }, { 0x20, 0x030a, "PropertySensorDevicePath" }, { 0x20, 0x030b, "PropertyHardwareRevision" }, { 0x20, 0x030c, "PropertyFirmwareVersion" }, { 0x20, 0x030d, "PropertyReleaseDate" }, { 0x20, 0x030e, "PropertyReportInterval" }, { 0x20, 0x030f, "PropertyChangeSensitivityAbsolute" }, { 0x20, 0x0310, "PropertyChangeSensitivityPercentofRange" }, { 0x20, 0x0311, "PropertyChangeSensitivityPercentRelative" }, { 0x20, 0x0312, "PropertyAccuracy" }, { 0x20, 0x0313, "PropertyResolution" }, { 0x20, 0x0314, "PropertyMaximum" }, { 0x20, 0x0315, "PropertyMinimum" }, { 0x20, 0x0316, "PropertyReportingState" }, { 0x20, 0x0317, "PropertySamplingRate" }, { 0x20, 0x0318, "PropertyResponseCurve" }, { 0x20, 0x0319, "PropertyPowerState" }, { 0x20, 0x031a, "PropertyMaximumFIFOEvents" }, { 0x20, 0x031b, "PropertyReportLatency" }, { 0x20, 0x031c, "PropertyFlushFIFOEvents" }, { 0x20, 0x031d, "PropertyMaximumPowerConsumption" }, { 0x20, 0x031e, "PropertyIsPrimary" }, { 0x20, 0x031f, "PropertyHumanPresenceDetectionType" }, { 0x20, 0x0400, "DataFieldLocation" }, { 0x20, 0x0402, "DataFieldAltitudeAntennaSeaLevel" }, { 0x20, 0x0403, "DataFieldDifferentialReferenceStationID" }, { 0x20, 0x0404, "DataFieldAltitudeEllipsoidError" }, { 0x20, 0x0405, "DataFieldAltitudeEllipsoid" }, { 0x20, 0x0406, "DataFieldAltitudeSeaLevelError" }, { 0x20, 0x0407, "DataFieldAltitudeSeaLevel" }, { 0x20, 0x0408, "DataFieldDifferentialGPSDataAge" }, { 0x20, 0x0409, "DataFieldErrorRadius" }, { 0x20, 0x040a, "DataFieldFixQuality" }, { 0x20, 0x040b, "DataFieldFixType" }, { 0x20, 0x040c, "DataFieldGeoidalSeparation" }, { 0x20, 0x040d, "DataFieldGPSOperationMode" }, { 0x20, 0x040e, "DataFieldGPSSelectionMode" }, { 0x20, 0x040f, "DataFieldGPSStatus" }, { 0x20, 0x0410, "DataFieldPositionDilutionofPrecision" }, { 0x20, 0x0411, "DataFieldHorizontalDilutionofPrecision" }, { 0x20, 0x0412, "DataFieldVerticalDilutionofPrecision" }, { 0x20, 0x0413, "DataFieldLatitude" }, { 0x20, 0x0414, "DataFieldLongitude" }, { 0x20, 0x0415, "DataFieldTrueHeading" }, { 0x20, 0x0416, "DataFieldMagneticHeading" }, { 0x20, 0x0417, "DataFieldMagneticVariation" }, { 0x20, 0x0418, "DataFieldSpeed" }, { 0x20, 0x0419, "DataFieldSatellitesinView" }, { 0x20, 0x041a, "DataFieldSatellitesinViewAzimuth" }, { 0x20, 0x041b, "DataFieldSatellitesinViewElevation" }, { 0x20, 0x041c, "DataFieldSatellitesinViewIDs" }, { 0x20, 0x041d, "DataFieldSatellitesinViewPRNs" }, { 0x20, 0x041e, "DataFieldSatellitesinViewSNRatios" }, { 0x20, 0x041f, "DataFieldSatellitesUsedCount" }, { 0x20, 0x0420, "DataFieldSatellitesUsedPRNs" }, { 0x20, 0x0421, "DataFieldNMEASentence" }, { 0x20, 0x0422, "DataFieldAddressLine1" }, { 0x20, 0x0423, "DataFieldAddressLine2" }, { 0x20, 0x0424, "DataFieldCity" }, { 0x20, 0x0425, "DataFieldStateorProvince" }, { 0x20, 0x0426, "DataFieldCountryorRegion" }, { 0x20, 0x0427, "DataFieldPostalCode" }, { 0x20, 0x042a, "PropertyLocation" }, { 0x20, 0x042b, "PropertyLocationDesiredAccuracy" }, { 0x20, 0x0430, "DataFieldEnvironmental" }, { 0x20, 0x0431, "DataFieldAtmosphericPressure" }, { 0x20, 0x0433, "DataFieldRelativeHumidity" }, { 0x20, 0x0434, "DataFieldTemperature" }, { 0x20, 0x0435, "DataFieldWindDirection" }, { 0x20, 0x0436, "DataFieldWindSpeed" }, { 0x20, 0x0437, "DataFieldAirQualityIndex" }, { 0x20, 0x0438, "DataFieldEquivalentCO2" }, { 0x20, 0x0439, "DataFieldVolatileOrganicCompoundConcentration" }, { 0x20, 0x043a, "DataFieldObjectPresence" }, { 0x20, 0x043b, "DataFieldObjectProximityRange" }, { 0x20, 0x043c, "DataFieldObjectProximityOutofRange" }, { 0x20, 0x0440, "PropertyEnvironmental" }, { 0x20, 0x0441, "PropertyReferencePressure" }, { 0x20, 0x0450, "DataFieldMotion" }, { 0x20, 0x0451, "DataFieldMotionState" }, { 0x20, 0x0452, "DataFieldAcceleration" }, { 0x20, 0x0453, "DataFieldAccelerationAxisX" }, { 0x20, 0x0454, "DataFieldAccelerationAxisY" }, { 0x20, 0x0455, "DataFieldAccelerationAxisZ" }, { 0x20, 0x0456, "DataFieldAngularVelocity" }, { 0x20, 0x0457, "DataFieldAngularVelocityaboutXAxis" }, { 0x20, 0x0458, "DataFieldAngularVelocityaboutYAxis" }, { 0x20, 0x0459, "DataFieldAngularVelocityaboutZAxis" }, { 0x20, 0x045a, "DataFieldAngularPosition" }, { 0x20, 0x045b, "DataFieldAngularPositionaboutXAxis" }, { 0x20, 0x045c, "DataFieldAngularPositionaboutYAxis" }, { 0x20, 0x045d, "DataFieldAngularPositionaboutZAxis" }, { 0x20, 0x045e, "DataFieldMotionSpeed" }, { 0x20, 0x045f, "DataFieldMotionIntensity" }, { 0x20, 0x0470, "DataFieldOrientation" }, { 0x20, 0x0471, "DataFieldHeading" }, { 0x20, 0x0472, "DataFieldHeadingXAxis" }, { 0x20, 0x0473, "DataFieldHeadingYAxis" }, { 0x20, 0x0474, "DataFieldHeadingZAxis" }, { 0x20, 0x0475, "DataFieldHeadingCompensatedMagneticNorth" }, { 0x20, 0x0476, "DataFieldHeadingCompensatedTrueNorth" }, { 0x20, 0x0477, "DataFieldHeadingMagneticNorth" }, { 0x20, 0x0478, "DataFieldHeadingTrueNorth" }, { 0x20, 0x0479, "DataFieldDistance" }, { 0x20, 0x047a, "DataFieldDistanceXAxis" }, { 0x20, 0x047b, "DataFieldDistanceYAxis" }, { 0x20, 0x047c, "DataFieldDistanceZAxis" }, { 0x20, 0x047d, "DataFieldDistanceOutofRange" }, { 0x20, 0x047e, "DataFieldTilt" }, { 0x20, 0x047f, "DataFieldTiltXAxis" }, { 0x20, 0x0480, "DataFieldTiltYAxis" }, { 0x20, 0x0481, "DataFieldTiltZAxis" }, { 0x20, 0x0482, "DataFieldRotationMatrix" }, { 0x20, 0x0483, "DataFieldQuaternion" }, { 0x20, 0x0484, "DataFieldMagneticFlux" }, { 0x20, 0x0485, "DataFieldMagneticFluxXAxis" }, { 0x20, 0x0486, "DataFieldMagneticFluxYAxis" }, { 0x20, 0x0487, "DataFieldMagneticFluxZAxis" }, { 0x20, 0x0488, "DataFieldMagnetometerAccuracy" }, { 0x20, 0x0489, "DataFieldSimpleOrientationDirection" }, { 0x20, 0x0490, "DataFieldMechanical" }, { 0x20, 0x0491, "DataFieldBooleanSwitchState" }, { 0x20, 0x0492, "DataFieldBooleanSwitchArrayStates" }, { 0x20, 0x0493, "DataFieldMultivalueSwitchValue" }, { 0x20, 0x0494, "DataFieldForce" }, { 0x20, 0x0495, "DataFieldAbsolutePressure" }, { 0x20, 0x0496, "DataFieldGaugePressure" }, { 0x20, 0x0497, "DataFieldStrain" }, { 0x20, 0x0498, "DataFieldWeight" }, { 0x20, 0x04a0, "PropertyMechanical" }, { 0x20, 0x04a1, "PropertyVibrationState" }, { 0x20, 0x04a2, "PropertyForwardVibrationSpeed" }, { 0x20, 0x04a3, "PropertyBackwardVibrationSpeed" }, { 0x20, 0x04b0, "DataFieldBiometric" }, { 0x20, 0x04b1, "DataFieldHumanPresence" }, { 0x20, 0x04b2, "DataFieldHumanProximityRange" }, { 0x20, 0x04b3, "DataFieldHumanProximityOutofRange" }, { 0x20, 0x04b4, "DataFieldHumanTouchState" }, { 0x20, 0x04b5, "DataFieldBloodPressure" }, { 0x20, 0x04b6, "DataFieldBloodPressureDiastolic" }, { 0x20, 0x04b7, "DataFieldBloodPressureSystolic" }, { 0x20, 0x04b8, "DataFieldHeartRate" }, { 0x20, 0x04b9, "DataFieldRestingHeartRate" }, { 0x20, 0x04ba, "DataFieldHeartbeatInterval" }, { 0x20, 0x04bb, "DataFieldRespiratoryRate" }, { 0x20, 0x04bc, "DataFieldSpO2" }, { 0x20, 0x04bd, "DataFieldHumanAttentionDetected" }, { 0x20, 0x04be, "DataFieldHumanHeadAzimuth" }, { 0x20, 0x04bf, "DataFieldHumanHeadAltitude" }, { 0x20, 0x04c0, "DataFieldHumanHeadRoll" }, { 0x20, 0x04c1, "DataFieldHumanHeadPitch" }, { 0x20, 0x04c2, "DataFieldHumanHeadYaw" }, { 0x20, 0x04c3, "DataFieldHumanCorrelationId" }, { 0x20, 0x04d0, "DataFieldLight" }, { 0x20, 0x04d1, "DataFieldIlluminance" }, { 0x20, 0x04d2, "DataFieldColorTemperature" }, { 0x20, 0x04d3, "DataFieldChromaticity" }, { 0x20, 0x04d4, "DataFieldChromaticityX" }, { 0x20, 0x04d5, "DataFieldChromaticityY" }, { 0x20, 0x04d6, "DataFieldConsumerIRSentenceReceive" }, { 0x20, 0x04d7, "DataFieldInfraredLight" }, { 0x20, 0x04d8, "DataFieldRedLight" }, { 0x20, 0x04d9, "DataFieldGreenLight" }, { 0x20, 0x04da, "DataFieldBlueLight" }, { 0x20, 0x04db, "DataFieldUltravioletALight" }, { 0x20, 0x04dc, "DataFieldUltravioletBLight" }, { 0x20, 0x04dd, "DataFieldUltravioletIndex" }, { 0x20, 0x04de, "DataFieldNearInfraredLight" }, { 0x20, 0x04df, "PropertyLight" }, { 0x20, 0x04e0, "PropertyConsumerIRSentenceSend" }, { 0x20, 0x04e2, "PropertyAutoBrightnessPreferred" }, { 0x20, 0x04e3, "PropertyAutoColorPreferred" }, { 0x20, 0x04f0, "DataFieldScanner" }, { 0x20, 0x04f1, "DataFieldRFIDTag40Bit" }, { 0x20, 0x04f2, "DataFieldNFCSentenceReceive" }, { 0x20, 0x04f8, "PropertyScanner" }, { 0x20, 0x04f9, "PropertyNFCSentenceSend" }, { 0x20, 0x0500, "DataFieldElectrical" }, { 0x20, 0x0501, "DataFieldCapacitance" }, { 0x20, 0x0502, "DataFieldCurrent" }, { 0x20, 0x0503, "DataFieldElectricalPower" }, { 0x20, 0x0504, "DataFieldInductance" }, { 0x20, 0x0505, "DataFieldResistance" }, { 0x20, 0x0506, "DataFieldVoltage" }, { 0x20, 0x0507, "DataFieldFrequency" }, { 0x20, 0x0508, "DataFieldPeriod" }, { 0x20, 0x0509, "DataFieldPercentofRange" }, { 0x20, 0x0520, "DataFieldTime" }, { 0x20, 0x0521, "DataFieldYear" }, { 0x20, 0x0522, "DataFieldMonth" }, { 0x20, 0x0523, "DataFieldDay" }, { 0x20, 0x0524, "DataFieldDayofWeek" }, { 0x20, 0x0525, "DataFieldHour" }, { 0x20, 0x0526, "DataFieldMinute" }, { 0x20, 0x0527, "DataFieldSecond" }, { 0x20, 0x0528, "DataFieldMillisecond" }, { 0x20, 0x0529, "DataFieldTimestamp" }, { 0x20, 0x052a, "DataFieldJulianDayofYear" }, { 0x20, 0x052b, "DataFieldTimeSinceSystemBoot" }, { 0x20, 0x0530, "PropertyTime" }, { 0x20, 0x0531, "PropertyTimeZoneOffsetfromUTC" }, { 0x20, 0x0532, "PropertyTimeZoneName" }, { 0x20, 0x0533, "PropertyDaylightSavingsTimeObserved" }, { 0x20, 0x0534, "PropertyTimeTrimAdjustment" }, { 0x20, 0x0535, "PropertyArmAlarm" }, { 0x20, 0x0540, "DataFieldCustom" }, { 0x20, 0x0541, "DataFieldCustomUsage" }, { 0x20, 0x0542, "DataFieldCustomBooleanArray" }, { 0x20, 0x0543, "DataFieldCustomValue" }, { 0x20, 0x0544, "DataFieldCustomValue1" }, { 0x20, 0x0545, "DataFieldCustomValue2" }, { 0x20, 0x0546, "DataFieldCustomValue3" }, { 0x20, 0x0547, "DataFieldCustomValue4" }, { 0x20, 0x0548, "DataFieldCustomValue5" }, { 0x20, 0x0549, "DataFieldCustomValue6" }, { 0x20, 0x054a, "DataFieldCustomValue7" }, { 0x20, 0x054b, "DataFieldCustomValue8" }, { 0x20, 0x054c, "DataFieldCustomValue9" }, { 0x20, 0x054d, "DataFieldCustomValue10" }, { 0x20, 0x054e, "DataFieldCustomValue11" }, { 0x20, 0x054f, "DataFieldCustomValue12" }, { 0x20, 0x0550, "DataFieldCustomValue13" }, { 0x20, 0x0551, "DataFieldCustomValue14" }, { 0x20, 0x0552, "DataFieldCustomValue15" }, { 0x20, 0x0553, "DataFieldCustomValue16" }, { 0x20, 0x0554, "DataFieldCustomValue17" }, { 0x20, 0x0555, "DataFieldCustomValue18" }, { 0x20, 0x0556, "DataFieldCustomValue19" }, { 0x20, 0x0557, "DataFieldCustomValue20" }, { 0x20, 0x0558, "DataFieldCustomValue21" }, { 0x20, 0x0559, "DataFieldCustomValue22" }, { 0x20, 0x055a, "DataFieldCustomValue23" }, { 0x20, 0x055b, "DataFieldCustomValue24" }, { 0x20, 0x055c, "DataFieldCustomValue25" }, { 0x20, 0x055d, "DataFieldCustomValue26" }, { 0x20, 0x055e, "DataFieldCustomValue27" }, { 0x20, 0x055f, "DataFieldCustomValue28" }, { 0x20, 0x0560, "DataFieldGeneric" }, { 0x20, 0x0561, "DataFieldGenericGUIDorPROPERTYKEY" }, { 0x20, 0x0562, "DataFieldGenericCategoryGUID" }, { 0x20, 0x0563, "DataFieldGenericTypeGUID" }, { 0x20, 0x0564, "DataFieldGenericEventPROPERTYKEY" }, { 0x20, 0x0565, "DataFieldGenericPropertyPROPERTYKEY" }, { 0x20, 0x0566, "DataFieldGenericDataFieldPROPERTYKEY" }, { 0x20, 0x0567, "DataFieldGenericEvent" }, { 0x20, 0x0568, "DataFieldGenericProperty" }, { 0x20, 0x0569, "DataFieldGenericDataField" }, { 0x20, 0x056a, "DataFieldEnumeratorTableRowIndex" }, { 0x20, 0x056b, "DataFieldEnumeratorTableRowCount" }, { 0x20, 0x056c, "DataFieldGenericGUIDorPROPERTYKEYkind" }, { 0x20, 0x056d, "DataFieldGenericGUID" }, { 0x20, 0x056e, "DataFieldGenericPROPERTYKEY" }, { 0x20, 0x056f, "DataFieldGenericTopLevelCollectionID" }, { 0x20, 0x0570, "DataFieldGenericReportID" }, { 0x20, 0x0571, "DataFieldGenericReportItemPositionIndex" }, { 0x20, 0x0572, "DataFieldGenericFirmwareVARTYPE" }, { 0x20, 0x0573, "DataFieldGenericUnitofMeasure" }, { 0x20, 0x0574, "DataFieldGenericUnitExponent" }, { 0x20, 0x0575, "DataFieldGenericReportSize" }, { 0x20, 0x0576, "DataFieldGenericReportCount" }, { 0x20, 0x0580, "PropertyGeneric" }, { 0x20, 0x0581, "PropertyEnumeratorTableRowIndex" }, { 0x20, 0x0582, "PropertyEnumeratorTableRowCount" }, { 0x20, 0x0590, "DataFieldPersonalActivity" }, { 0x20, 0x0591, "DataFieldActivityType" }, { 0x20, 0x0592, "DataFieldActivityState" }, { 0x20, 0x0593, "DataFieldDevicePosition" }, { 0x20, 0x0594, "DataFieldStepCount" }, { 0x20, 0x0595, "DataFieldStepCountReset" }, { 0x20, 0x0596, "DataFieldStepDuration" }, { 0x20, 0x0597, "DataFieldStepType" }, { 0x20, 0x05a0, "PropertyMinimumActivityDetectionInterval" }, { 0x20, 0x05a1, "PropertySupportedActivityTypes" }, { 0x20, 0x05a2, "PropertySubscribedActivityTypes" }, { 0x20, 0x05a3, "PropertySupportedStepTypes" }, { 0x20, 0x05a4, "PropertySubscribedStepTypes" }, { 0x20, 0x05a5, "PropertyFloorHeight" }, { 0x20, 0x05b0, "DataFieldCustomTypeID" }, { 0x20, 0x05c0, "PropertyCustom" }, { 0x20, 0x05c1, "PropertyCustomValue1" }, { 0x20, 0x05c2, "PropertyCustomValue2" }, { 0x20, 0x05c3, "PropertyCustomValue3" }, { 0x20, 0x05c4, "PropertyCustomValue4" }, { 0x20, 0x05c5, "PropertyCustomValue5" }, { 0x20, 0x05c6, "PropertyCustomValue6" }, { 0x20, 0x05c7, "PropertyCustomValue7" }, { 0x20, 0x05c8, "PropertyCustomValue8" }, { 0x20, 0x05c9, "PropertyCustomValue9" }, { 0x20, 0x05ca, "PropertyCustomValue10" }, { 0x20, 0x05cb, "PropertyCustomValue11" }, { 0x20, 0x05cc, "PropertyCustomValue12" }, { 0x20, 0x05cd, "PropertyCustomValue13" }, { 0x20, 0x05ce, "PropertyCustomValue14" }, { 0x20, 0x05cf, "PropertyCustomValue15" }, { 0x20, 0x05d0, "PropertyCustomValue16" }, { 0x20, 0x05e0, "DataFieldHinge" }, { 0x20, 0x05e1, "DataFieldHingeAngle" }, { 0x20, 0x05f0, "DataFieldGestureSensor" }, { 0x20, 0x05f1, "DataFieldGestureState" }, { 0x20, 0x05f2, "DataFieldHingeFoldInitialAngle" }, { 0x20, 0x05f3, "DataFieldHingeFoldFinalAngle" }, { 0x20, 0x05f4, "DataFieldHingeFoldContributingPanel" }, { 0x20, 0x05f5, "DataFieldHingeFoldType" }, { 0x20, 0x0800, "SensorStateUndefined" }, { 0x20, 0x0801, "SensorStateReady" }, { 0x20, 0x0802, "SensorStateNotAvailable" }, { 0x20, 0x0803, "SensorStateNoData" }, { 0x20, 0x0804, "SensorStateInitializing" }, { 0x20, 0x0805, "SensorStateAccessDenied" }, { 0x20, 0x0806, "SensorStateError" }, { 0x20, 0x0810, "SensorEventUnknown" }, { 0x20, 0x0811, "SensorEventStateChanged" }, { 0x20, 0x0812, "SensorEventPropertyChanged" }, { 0x20, 0x0813, "SensorEventDataUpdated" }, { 0x20, 0x0814, "SensorEventPollResponse" }, { 0x20, 0x0815, "SensorEventChangeSensitivity" }, { 0x20, 0x0816, "SensorEventRangeMaximumReached" }, { 0x20, 0x0817, "SensorEventRangeMinimumReached" }, { 0x20, 0x0818, "SensorEventHighThresholdCrossUpward" }, { 0x20, 0x0819, "SensorEventHighThresholdCrossDownward" }, { 0x20, 0x081a, "SensorEventLowThresholdCrossUpward" }, { 0x20, 0x081b, "SensorEventLowThresholdCrossDownward" }, { 0x20, 0x081c, "SensorEventZeroThresholdCrossUpward" }, { 0x20, 0x081d, "SensorEventZeroThresholdCrossDownward" }, { 0x20, 0x081e, "SensorEventPeriodExceeded" }, { 0x20, 0x081f, "SensorEventFrequencyExceeded" }, { 0x20, 0x0820, "SensorEventComplexTrigger" }, { 0x20, 0x0830, "ConnectionTypePCIntegrated" }, { 0x20, 0x0831, "ConnectionTypePCAttached" }, { 0x20, 0x0832, "ConnectionTypePCExternal" }, { 0x20, 0x0840, "ReportingStateReportNoEvents" }, { 0x20, 0x0841, "ReportingStateReportAllEvents" }, { 0x20, 0x0842, "ReportingStateReportThresholdEvents" }, { 0x20, 0x0843, "ReportingStateWakeOnNoEvents" }, { 0x20, 0x0844, "ReportingStateWakeOnAllEvents" }, { 0x20, 0x0845, "ReportingStateWakeOnThresholdEvents" }, { 0x20, 0x0846, "ReportingStateAnytime" }, { 0x20, 0x0850, "PowerStateUndefined" }, { 0x20, 0x0851, "PowerStateD0FullPower" }, { 0x20, 0x0852, "PowerStateD1LowPower" }, { 0x20, 0x0853, "PowerStateD2StandbyPowerwithWakeup" }, { 0x20, 0x0854, "PowerStateD3SleepwithWakeup" }, { 0x20, 0x0855, "PowerStateD4PowerOff" }, { 0x20, 0x0860, "AccuracyDefault" }, { 0x20, 0x0861, "AccuracyHigh" }, { 0x20, 0x0862, "AccuracyMedium" }, { 0x20, 0x0863, "AccuracyLow" }, { 0x20, 0x0870, "FixQualityNoFix" }, { 0x20, 0x0871, "FixQualityGPS" }, { 0x20, 0x0872, "FixQualityDGPS" }, { 0x20, 0x0880, "FixTypeNoFix" }, { 0x20, 0x0881, "FixTypeGPSSPSModeFixValid" }, { 0x20, 0x0882, "FixTypeDGPSSPSModeFixValid" }, { 0x20, 0x0883, "FixTypeGPSPPSModeFixValid" }, { 0x20, 0x0884, "FixTypeRealTimeKinematic" }, { 0x20, 0x0885, "FixTypeFloatRTK" }, { 0x20, 0x0886, "FixTypeEstimateddeadreckoned" }, { 0x20, 0x0887, "FixTypeManualInputMode" }, { 0x20, 0x0888, "FixTypeSimulatorMode" }, { 0x20, 0x0890, "GPSOperationModeManual" }, { 0x20, 0x0891, "GPSOperationModeAutomatic" }, { 0x20, 0x08a0, "GPSSelectionModeAutonomous" }, { 0x20, 0x08a1, "GPSSelectionModeDGPS" }, { 0x20, 0x08a2, "GPSSelectionModeEstimateddeadreckoned" }, { 0x20, 0x08a3, "GPSSelectionModeManualInput" }, { 0x20, 0x08a4, "GPSSelectionModeSimulator" }, { 0x20, 0x08a5, "GPSSelectionModeDataNotValid" }, { 0x20, 0x08b0, "GPSStatusDataValid" }, { 0x20, 0x08b1, "GPSStatusDataNotValid" }, { 0x20, 0x08c0, "DayofWeekSunday" }, { 0x20, 0x08c1, "DayofWeekMonday" }, { 0x20, 0x08c2, "DayofWeekTuesday" }, { 0x20, 0x08c3, "DayofWeekWednesday" }, { 0x20, 0x08c4, "DayofWeekThursday" }, { 0x20, 0x08c5, "DayofWeekFriday" }, { 0x20, 0x08c6, "DayofWeekSaturday" }, { 0x20, 0x08d0, "KindCategory" }, { 0x20, 0x08d1, "KindType" }, { 0x20, 0x08d2, "KindEvent" }, { 0x20, 0x08d3, "KindProperty" }, { 0x20, 0x08d4, "KindDataField" }, { 0x20, 0x08e0, "MagnetometerAccuracyLow" }, { 0x20, 0x08e1, "MagnetometerAccuracyMedium" }, { 0x20, 0x08e2, "MagnetometerAccuracyHigh" }, { 0x20, 0x08f0, "SimpleOrientationDirectionNotRotated" }, { 0x20, 0x08f1, "SimpleOrientationDirectionRotated90DegreesCCW" }, { 0x20, 0x08f2, "SimpleOrientationDirectionRotated180DegreesCCW" }, { 0x20, 0x08f3, "SimpleOrientationDirectionRotated270DegreesCCW" }, { 0x20, 0x08f4, "SimpleOrientationDirectionFaceUp" }, { 0x20, 0x08f5, "SimpleOrientationDirectionFaceDown" }, { 0x20, 0x0900, "VT_NULL" }, { 0x20, 0x0901, "VT_BOOL" }, { 0x20, 0x0902, "VT_UI1" }, { 0x20, 0x0903, "VT_I1" }, { 0x20, 0x0904, "VT_UI2" }, { 0x20, 0x0905, "VT_I2" }, { 0x20, 0x0906, "VT_UI4" }, { 0x20, 0x0907, "VT_I4" }, { 0x20, 0x0908, "VT_UI8" }, { 0x20, 0x0909, "VT_I8" }, { 0x20, 0x090a, "VT_R4" }, { 0x20, 0x090b, "VT_R8" }, { 0x20, 0x090c, "VT_WSTR" }, { 0x20, 0x090d, "VT_STR" }, { 0x20, 0x090e, "VT_CLSID" }, { 0x20, 0x090f, "VT_VECTORVT_UI1" }, { 0x20, 0x0910, "VT_F16E0" }, { 0x20, 0x0911, "VT_F16E1" }, { 0x20, 0x0912, "VT_F16E2" }, { 0x20, 0x0913, "VT_F16E3" }, { 0x20, 0x0914, "VT_F16E4" }, { 0x20, 0x0915, "VT_F16E5" }, { 0x20, 0x0916, "VT_F16E6" }, { 0x20, 0x0917, "VT_F16E7" }, { 0x20, 0x0918, "VT_F16E8" }, { 0x20, 0x0919, "VT_F16E9" }, { 0x20, 0x091a, "VT_F16EA" }, { 0x20, 0x091b, "VT_F16EB" }, { 0x20, 0x091c, "VT_F16EC" }, { 0x20, 0x091d, "VT_F16ED" }, { 0x20, 0x091e, "VT_F16EE" }, { 0x20, 0x091f, "VT_F16EF" }, { 0x20, 0x0920, "VT_F32E0" }, { 0x20, 0x0921, "VT_F32E1" }, { 0x20, 0x0922, "VT_F32E2" }, { 0x20, 0x0923, "VT_F32E3" }, { 0x20, 0x0924, "VT_F32E4" }, { 0x20, 0x0925, "VT_F32E5" }, { 0x20, 0x0926, "VT_F32E6" }, { 0x20, 0x0927, "VT_F32E7" }, { 0x20, 0x0928, "VT_F32E8" }, { 0x20, 0x0929, "VT_F32E9" }, { 0x20, 0x092a, "VT_F32EA" }, { 0x20, 0x092b, "VT_F32EB" }, { 0x20, 0x092c, "VT_F32EC" }, { 0x20, 0x092d, "VT_F32ED" }, { 0x20, 0x092e, "VT_F32EE" }, { 0x20, 0x092f, "VT_F32EF" }, { 0x20, 0x0930, "ActivityTypeUnknown" }, { 0x20, 0x0931, "ActivityTypeStationary" }, { 0x20, 0x0932, "ActivityTypeFidgeting" }, { 0x20, 0x0933, "ActivityTypeWalking" }, { 0x20, 0x0934, "ActivityTypeRunning" }, { 0x20, 0x0935, "ActivityTypeInVehicle" }, { 0x20, 0x0936, "ActivityTypeBiking" }, { 0x20, 0x0937, "ActivityTypeIdle" }, { 0x20, 0x0940, "UnitNotSpecified" }, { 0x20, 0x0941, "UnitLux" }, { 0x20, 0x0942, "UnitDegreesKelvin" }, { 0x20, 0x0943, "UnitDegreesCelsius" }, { 0x20, 0x0944, "UnitPascal" }, { 0x20, 0x0945, "UnitNewton" }, { 0x20, 0x0946, "UnitMetersSecond" }, { 0x20, 0x0947, "UnitKilogram" }, { 0x20, 0x0948, "UnitMeter" }, { 0x20, 0x0949, "UnitMetersSecondSecond" }, { 0x20, 0x094a, "UnitFarad" }, { 0x20, 0x094b, "UnitAmpere" }, { 0x20, 0x094c, "UnitWatt" }, { 0x20, 0x094d, "UnitHenry" }, { 0x20, 0x094e, "UnitOhm" }, { 0x20, 0x094f, "UnitVolt" }, { 0x20, 0x0950, "UnitHertz" }, { 0x20, 0x0951, "UnitBar" }, { 0x20, 0x0952, "UnitDegreesAnticlockwise" }, { 0x20, 0x0953, "UnitDegreesClockwise" }, { 0x20, 0x0954, "UnitDegrees" }, { 0x20, 0x0955, "UnitDegreesSecond" }, { 0x20, 0x0956, "UnitDegreesSecondSecond" }, { 0x20, 0x0957, "UnitKnot" }, { 0x20, 0x0958, "UnitPercent" }, { 0x20, 0x0959, "UnitSecond" }, { 0x20, 0x095a, "UnitMillisecond" }, { 0x20, 0x095b, "UnitG" }, { 0x20, 0x095c, "UnitBytes" }, { 0x20, 0x095d, "UnitMilligauss" }, { 0x20, 0x095e, "UnitBits" }, { 0x20, 0x0960, "ActivityStateNoStateChange" }, { 0x20, 0x0961, "ActivityStateStartActivity" }, { 0x20, 0x0962, "ActivityStateEndActivity" }, { 0x20, 0x0970, "Exponent0" }, { 0x20, 0x0971, "Exponent1" }, { 0x20, 0x0972, "Exponent2" }, { 0x20, 0x0973, "Exponent3" }, { 0x20, 0x0974, "Exponent4" }, { 0x20, 0x0975, "Exponent5" }, { 0x20, 0x0976, "Exponent6" }, { 0x20, 0x0977, "Exponent7" }, { 0x20, 0x0978, "Exponent8" }, { 0x20, 0x0979, "Exponent9" }, { 0x20, 0x097a, "ExponentA" }, { 0x20, 0x097b, "ExponentB" }, { 0x20, 0x097c, "ExponentC" }, { 0x20, 0x097d, "ExponentD" }, { 0x20, 0x097e, "ExponentE" }, { 0x20, 0x097f, "ExponentF" }, { 0x20, 0x0980, "DevicePositionUnknown" }, { 0x20, 0x0981, "DevicePositionUnchanged" }, { 0x20, 0x0982, "DevicePositionOnDesk" }, { 0x20, 0x0983, "DevicePositionInHand" }, { 0x20, 0x0984, "DevicePositionMovinginBag" }, { 0x20, 0x0985, "DevicePositionStationaryinBag" }, { 0x20, 0x0990, "StepTypeUnknown" }, { 0x20, 0x0991, "StepTypeWalking" }, { 0x20, 0x0992, "StepTypeRunning" }, { 0x20, 0x09a0, "GestureStateUnknown" }, { 0x20, 0x09a1, "GestureStateStarted" }, { 0x20, 0x09a2, "GestureStateCompleted" }, { 0x20, 0x09a3, "GestureStateCancelled" }, { 0x20, 0x09b0, "HingeFoldContributingPanelUnknown" }, { 0x20, 0x09b1, "HingeFoldContributingPanelPanel1" }, { 0x20, 0x09b2, "HingeFoldContributingPanelPanel2" }, { 0x20, 0x09b3, "HingeFoldContributingPanelBoth" }, { 0x20, 0x09b4, "HingeFoldTypeUnknown" }, { 0x20, 0x09b5, "HingeFoldTypeIncreasing" }, { 0x20, 0x09b6, "HingeFoldTypeDecreasing" }, { 0x20, 0x09c0, "HumanPresenceDetectionTypeVendorDefinedNonBiometric" }, { 0x20, 0x09c1, "HumanPresenceDetectionTypeVendorDefinedBiometric" }, { 0x20, 0x09c2, "HumanPresenceDetectionTypeFacialBiometric" }, { 0x20, 0x09c3, "HumanPresenceDetectionTypeAudioBiometric" }, { 0x20, 0x1000, "ModifierChangeSensitivityAbsolute" }, { 0x20, 0x2000, "ModifierMaximum" }, { 0x20, 0x3000, "ModifierMinimum" }, { 0x20, 0x4000, "ModifierAccuracy" }, { 0x20, 0x5000, "ModifierResolution" }, { 0x20, 0x6000, "ModifierThresholdHigh" }, { 0x20, 0x7000, "ModifierThresholdLow" }, { 0x20, 0x8000, "ModifierCalibrationOffset" }, { 0x20, 0x9000, "ModifierCalibrationMultiplier" }, { 0x20, 0xa000, "ModifierReportInterval" }, { 0x20, 0xb000, "ModifierFrequencyMax" }, { 0x20, 0xc000, "ModifierPeriodMax" }, { 0x20, 0xd000, "ModifierChangeSensitivityPercentofRange" }, { 0x20, 0xe000, "ModifierChangeSensitivityPercentRelative" }, { 0x20, 0xf000, "ModifierVendorReserved" }, { 0x40, 0, "MedicalInstrument" }, { 0x40, 0x0001, "MedicalUltrasound" }, { 0x40, 0x0020, "VCRAcquisition" }, { 0x40, 0x0021, "FreezeThaw" }, { 0x40, 0x0022, "ClipStore" }, { 0x40, 0x0023, "Update" }, { 0x40, 0x0024, "Next" }, { 0x40, 0x0025, "Save" }, { 0x40, 0x0026, "Print" }, { 0x40, 0x0027, "MicrophoneEnable" }, { 0x40, 0x0040, "Cine" }, { 0x40, 0x0041, "TransmitPower" }, { 0x40, 0x0042, "Volume" }, { 0x40, 0x0043, "Focus" }, { 0x40, 0x0044, "Depth" }, { 0x40, 0x0060, "SoftStepPrimary" }, { 0x40, 0x0061, "SoftStepSecondary" }, { 0x40, 0x0070, "DepthGainCompensation" }, { 0x40, 0x0080, "ZoomSelect" }, { 0x40, 0x0081, "ZoomAdjust" }, { 0x40, 0x0082, "SpectralDopplerModeSelect" }, { 0x40, 0x0083, "SpectralDopplerAdjust" }, { 0x40, 0x0084, "ColorDopplerModeSelect" }, { 0x40, 0x0085, "ColorDopplerAdjust" }, { 0x40, 0x0086, "MotionModeSelect" }, { 0x40, 0x0087, "MotionModeAdjust" }, { 0x40, 0x0088, "2DModeSelect" }, { 0x40, 0x0089, "2DModeAdjust" }, { 0x40, 0x00a0, "SoftControlSelect" }, { 0x40, 0x00a1, "SoftControlAdjust" }, { 0x41, 0, "BrailleDisplay" }, { 0x41, 0x0001, "BrailleDisplay" }, { 0x41, 0x0002, "BrailleRow" }, { 0x41, 0x0003, "8DotBrailleCell" }, { 0x41, 0x0004, "6DotBrailleCell" }, { 0x41, 0x0005, "NumberofBrailleCells" }, { 0x41, 0x0006, "ScreenReaderControl" }, { 0x41, 0x0007, "ScreenReaderIdentifier" }, { 0x41, 0x00fa, "RouterSet1" }, { 0x41, 0x00fb, "RouterSet2" }, { 0x41, 0x00fc, "RouterSet3" }, { 0x41, 0x0100, "RouterKey" }, { 0x41, 0x0101, "RowRouterKey" }, { 0x41, 0x0200, "BrailleButtons" }, { 0x41, 0x0201, "BrailleKeyboardDot1" }, { 0x41, 0x0202, "BrailleKeyboardDot2" }, { 0x41, 0x0203, "BrailleKeyboardDot3" }, { 0x41, 0x0204, "BrailleKeyboardDot4" }, { 0x41, 0x0205, "BrailleKeyboardDot5" }, { 0x41, 0x0206, "BrailleKeyboardDot6" }, { 0x41, 0x0207, "BrailleKeyboardDot7" }, { 0x41, 0x0208, "BrailleKeyboardDot8" }, { 0x41, 0x0209, "BrailleKeyboardSpace" }, { 0x41, 0x020a, "BrailleKeyboardLeftSpace" }, { 0x41, 0x020b, "BrailleKeyboardRightSpace" }, { 0x41, 0x020c, "BrailleFaceControls" }, { 0x41, 0x020d, "BrailleLeftControls" }, { 0x41, 0x020e, "BrailleRightControls" }, { 0x41, 0x020f, "BrailleTopControls" }, { 0x41, 0x0210, "BrailleJoystickCenter" }, { 0x41, 0x0211, "BrailleJoystickUp" }, { 0x41, 0x0212, "BrailleJoystickDown" }, { 0x41, 0x0213, "BrailleJoystickLeft" }, { 0x41, 0x0214, "BrailleJoystickRight" }, { 0x41, 0x0215, "BrailleDPadCenter" }, { 0x41, 0x0216, "BrailleDPadUp" }, { 0x41, 0x0217, "BrailleDPadDown" }, { 0x41, 0x0218, "BrailleDPadLeft" }, { 0x41, 0x0219, "BrailleDPadRight" }, { 0x41, 0x021a, "BraillePanLeft" }, { 0x41, 0x021b, "BraillePanRight" }, { 0x41, 0x021c, "BrailleRockerUp" }, { 0x41, 0x021d, "BrailleRockerDown" }, { 0x41, 0x021e, "BrailleRockerPress" }, { 0x59, 0, "LightingAndIllumination" }, { 0x59, 0x0001, "LampArray" }, { 0x59, 0x0002, "LampArrayAttributesReport" }, { 0x59, 0x0003, "LampCount" }, { 0x59, 0x0004, "BoundingBoxWidthInMicrometers" }, { 0x59, 0x0005, "BoundingBoxHeightInMicrometers" }, { 0x59, 0x0006, "BoundingBoxDepthInMicrometers" }, { 0x59, 0x0007, "LampArrayKind" }, { 0x59, 0x0008, "MinUpdateIntervalInMicroseconds" }, { 0x59, 0x0020, "LampAttributesRequestReport" }, { 0x59, 0x0021, "LampId" }, { 0x59, 0x0022, "LampAttributesResponseReport" }, { 0x59, 0x0023, "PositionXInMicrometers" }, { 0x59, 0x0024, "PositionYInMicrometers" }, { 0x59, 0x0025, "PositionZInMicrometers" }, { 0x59, 0x0026, "LampPurposes" }, { 0x59, 0x0027, "UpdateLatencyInMicroseconds" }, { 0x59, 0x0028, "RedLevelCount" }, { 0x59, 0x0029, "GreenLevelCount" }, { 0x59, 0x002a, "BlueLevelCount" }, { 0x59, 0x002b, "IntensityLevelCount" }, { 0x59, 0x002c, "IsProgrammable" }, { 0x59, 0x002d, "InputBinding" }, { 0x59, 0x0050, "LampMultiUpdateReport" }, { 0x59, 0x0051, "RedUpdateChannel" }, { 0x59, 0x0052, "GreenUpdateChannel" }, { 0x59, 0x0053, "BlueUpdateChannel" }, { 0x59, 0x0054, "IntensityUpdateChannel" }, { 0x59, 0x0055, "LampUpdateFlags" }, { 0x59, 0x0060, "LampRangeUpdateReport" }, { 0x59, 0x0061, "LampIdStart" }, { 0x59, 0x0062, "LampIdEnd" }, { 0x59, 0x0070, "LampArrayControlReport" }, { 0x59, 0x0071, "AutonomousMode" }, { 0x80, 0, "Monitor" }, { 0x80, 0x0001, "MonitorControl" }, { 0x80, 0x0002, "EDIDInformation" }, { 0x80, 0x0003, "VDIFInformation" }, { 0x80, 0x0004, "VESAVersion" }, { 0x81, 0, "MonitorEnumerated" }, { 0x82, 0, "VESAVirtualControls" }, { 0x82, 0x0001, "Degauss" }, { 0x82, 0x0010, "Brightness" }, { 0x82, 0x0012, "Contrast" }, { 0x82, 0x0016, "RedVideoGain" }, { 0x82, 0x0018, "GreenVideoGain" }, { 0x82, 0x001a, "BlueVideoGain" }, { 0x82, 0x001c, "Focus" }, { 0x82, 0x0020, "HorizontalPosition" }, { 0x82, 0x0022, "HorizontalSize" }, { 0x82, 0x0024, "HorizontalPincushion" }, { 0x82, 0x0026, "HorizontalPincushionBalance" }, { 0x82, 0x0028, "HorizontalMisconvergence" }, { 0x82, 0x002a, "HorizontalLinearity" }, { 0x82, 0x002c, "HorizontalLinearityBalance" }, { 0x82, 0x0030, "VerticalPosition" }, { 0x82, 0x0032, "VerticalSize" }, { 0x82, 0x0034, "VerticalPincushion" }, { 0x82, 0x0036, "VerticalPincushionBalance" }, { 0x82, 0x0038, "VerticalMisconvergence" }, { 0x82, 0x003a, "VerticalLinearity" }, { 0x82, 0x003c, "VerticalLinearityBalance" }, { 0x82, 0x0040, "ParallelogramDistortionKeyBalance" }, { 0x82, 0x0042, "TrapezoidalDistortionKey" }, { 0x82, 0x0044, "TiltRotation" }, { 0x82, 0x0046, "TopCornerDistortionControl" }, { 0x82, 0x0048, "TopCornerDistortionBalance" }, { 0x82, 0x004a, "BottomCornerDistortionControl" }, { 0x82, 0x004c, "BottomCornerDistortionBalance" }, { 0x82, 0x0056, "HorizontalMoire" }, { 0x82, 0x0058, "VerticalMoire" }, { 0x82, 0x005e, "InputLevelSelect" }, { 0x82, 0x0060, "InputSourceSelect" }, { 0x82, 0x006c, "RedVideoBlackLevel" }, { 0x82, 0x006e, "GreenVideoBlackLevel" }, { 0x82, 0x0070, "BlueVideoBlackLevel" }, { 0x82, 0x00a2, "AutoSizeCenter" }, { 0x82, 0x00a4, "PolarityHorizontalSynchronization" }, { 0x82, 0x00a6, "PolarityVerticalSynchronization" }, { 0x82, 0x00a8, "SynchronizationType" }, { 0x82, 0x00aa, "ScreenOrientation" }, { 0x82, 0x00ac, "HorizontalFrequency" }, { 0x82, 0x00ae, "VerticalFrequency" }, { 0x82, 0x00b0, "Settings" }, { 0x82, 0x00ca, "OnScreenDisplay" }, { 0x82, 0x00d4, "StereoMode" }, { 0x84, 0, "Power" }, { 0x84, 0x0001, "iName" }, { 0x84, 0x0002, "PresentStatus" }, { 0x84, 0x0003, "ChangedStatus" }, { 0x84, 0x0004, "UPS" }, { 0x84, 0x0005, "PowerSupply" }, { 0x84, 0x0010, "BatterySystem" }, { 0x84, 0x0011, "BatterySystemId" }, { 0x84, 0x0012, "Battery" }, { 0x84, 0x0013, "BatteryId" }, { 0x84, 0x0014, "Charger" }, { 0x84, 0x0015, "ChargerId" }, { 0x84, 0x0016, "PowerConverter" }, { 0x84, 0x0017, "PowerConverterId" }, { 0x84, 0x0018, "OutletSystem" }, { 0x84, 0x0019, "OutletSystemId" }, { 0x84, 0x001a, "Input" }, { 0x84, 0x001b, "InputId" }, { 0x84, 0x001c, "Output" }, { 0x84, 0x001d, "OutputId" }, { 0x84, 0x001e, "Flow" }, { 0x84, 0x001f, "FlowId" }, { 0x84, 0x0020, "Outlet" }, { 0x84, 0x0021, "OutletId" }, { 0x84, 0x0022, "Gang" }, { 0x84, 0x0023, "GangId" }, { 0x84, 0x0024, "PowerSummary" }, { 0x84, 0x0025, "PowerSummaryId" }, { 0x84, 0x0030, "Voltage" }, { 0x84, 0x0031, "Current" }, { 0x84, 0x0032, "Frequency" }, { 0x84, 0x0033, "ApparentPower" }, { 0x84, 0x0034, "ActivePower" }, { 0x84, 0x0035, "PercentLoad" }, { 0x84, 0x0036, "Temperature" }, { 0x84, 0x0037, "Humidity" }, { 0x84, 0x0038, "BadCount" }, { 0x84, 0x0040, "ConfigVoltage" }, { 0x84, 0x0041, "ConfigCurrent" }, { 0x84, 0x0042, "ConfigFrequency" }, { 0x84, 0x0043, "ConfigApparentPower" }, { 0x84, 0x0044, "ConfigActivePower" }, { 0x84, 0x0045, "ConfigPercentLoad" }, { 0x84, 0x0046, "ConfigTemperature" }, { 0x84, 0x0047, "ConfigHumidity" }, { 0x84, 0x0050, "SwitchOnControl" }, { 0x84, 0x0051, "SwitchOffControl" }, { 0x84, 0x0052, "ToggleControl" }, { 0x84, 0x0053, "LowVoltageTransfer" }, { 0x84, 0x0054, "HighVoltageTransfer" }, { 0x84, 0x0055, "DelayBeforeReboot" }, { 0x84, 0x0056, "DelayBeforeStartup" }, { 0x84, 0x0057, "DelayBeforeShutdown" }, { 0x84, 0x0058, "Test" }, { 0x84, 0x0059, "ModuleReset" }, { 0x84, 0x005a, "AudibleAlarmControl" }, { 0x84, 0x0060, "Present" }, { 0x84, 0x0061, "Good" }, { 0x84, 0x0062, "InternalFailure" }, { 0x84, 0x0063, "VoltagOutOfRange" }, { 0x84, 0x0064, "FrequencyOutOfRange" }, { 0x84, 0x0065, "Overload" }, { 0x84, 0x0066, "OverCharged" }, { 0x84, 0x0067, "OverTemperature" }, { 0x84, 0x0068, "ShutdownRequested" }, { 0x84, 0x0069, "ShutdownImminent" }, { 0x84, 0x006b, "SwitchOnOff" }, { 0x84, 0x006c, "Switchable" }, { 0x84, 0x006d, "Used" }, { 0x84, 0x006e, "Boost" }, { 0x84, 0x006f, "Buck" }, { 0x84, 0x0070, "Initialized" }, { 0x84, 0x0071, "Tested" }, { 0x84, 0x0072, "AwaitingPower" }, { 0x84, 0x0073, "CommunicationLost" }, { 0x84, 0x00fd, "iManufacturer" }, { 0x84, 0x00fe, "iProduct" }, { 0x84, 0x00ff, "iSerialNumber" }, { 0x85, 0, "BatterySystem" }, { 0x85, 0x0001, "SmartBatteryBatteryMode" }, { 0x85, 0x0002, "SmartBatteryBatteryStatus" }, { 0x85, 0x0003, "SmartBatteryAlarmWarning" }, { 0x85, 0x0004, "SmartBatteryChargerMode" }, { 0x85, 0x0005, "SmartBatteryChargerStatus" }, { 0x85, 0x0006, "SmartBatteryChargerSpecInfo" }, { 0x85, 0x0007, "SmartBatterySelectorState" }, { 0x85, 0x0008, "SmartBatterySelectorPresets" }, { 0x85, 0x0009, "SmartBatterySelectorInfo" }, { 0x85, 0x0010, "OptionalMfgFunction1" }, { 0x85, 0x0011, "OptionalMfgFunction2" }, { 0x85, 0x0012, "OptionalMfgFunction3" }, { 0x85, 0x0013, "OptionalMfgFunction4" }, { 0x85, 0x0014, "OptionalMfgFunction5" }, { 0x85, 0x0015, "ConnectionToSMBus" }, { 0x85, 0x0016, "OutputConnection" }, { 0x85, 0x0017, "ChargerConnection" }, { 0x85, 0x0018, "BatteryInsertion" }, { 0x85, 0x0019, "UseNext" }, { 0x85, 0x001a, "OKToUse" }, { 0x85, 0x001b, "BatterySupported" }, { 0x85, 0x001c, "SelectorRevision" }, { 0x85, 0x001d, "ChargingIndicator" }, { 0x85, 0x0028, "ManufacturerAccess" }, { 0x85, 0x0029, "RemainingCapacityLimit" }, { 0x85, 0x002a, "RemainingTimeLimit" }, { 0x85, 0x002b, "AtRate" }, { 0x85, 0x002c, "CapacityMode" }, { 0x85, 0x002d, "BroadcastToCharger" }, { 0x85, 0x002e, "PrimaryBattery" }, { 0x85, 0x002f, "ChargeController" }, { 0x85, 0x0040, "TerminateCharge" }, { 0x85, 0x0041, "TerminateDischarge" }, { 0x85, 0x0042, "BelowRemainingCapacityLimit" }, { 0x85, 0x0043, "RemainingTimeLimitExpired" }, { 0x85, 0x0044, "Charging" }, { 0x85, 0x0045, "Discharging" }, { 0x85, 0x0046, "FullyCharged" }, { 0x85, 0x0047, "FullyDischarged" }, { 0x85, 0x0048, "ConditioningFlag" }, { 0x85, 0x0049, "AtRateOK" }, { 0x85, 0x004a, "SmartBatteryErrorCode" }, { 0x85, 0x004b, "NeedReplacement" }, { 0x85, 0x0060, "AtRateTimeToFull" }, { 0x85, 0x0061, "AtRateTimeToEmpty" }, { 0x85, 0x0062, "AverageCurrent" }, { 0x85, 0x0063, "MaxError" }, { 0x85, 0x0064, "RelativeStateOfCharge" }, { 0x85, 0x0065, "AbsoluteStateOfCharge" }, { 0x85, 0x0066, "RemainingCapacity" }, { 0x85, 0x0067, "FullChargeCapacity" }, { 0x85, 0x0068, "RunTimeToEmpty" }, { 0x85, 0x0069, "AverageTimeToEmpty" }, { 0x85, 0x006a, "AverageTimeToFull" }, { 0x85, 0x006b, "CycleCount" }, { 0x85, 0x0080, "BatteryPackModelLevel" }, { 0x85, 0x0081, "InternalChargeController" }, { 0x85, 0x0082, "PrimaryBatterySupport" }, { 0x85, 0x0083, "DesignCapacity" }, { 0x85, 0x0084, "SpecificationInfo" }, { 0x85, 0x0085, "ManufactureDate" }, { 0x85, 0x0086, "SerialNumber" }, { 0x85, 0x0087, "iManufacturerName" }, { 0x85, 0x0088, "iDeviceName" }, { 0x85, 0x0089, "iDeviceChemistry" }, { 0x85, 0x008a, "ManufacturerData" }, { 0x85, 0x008b, "Rechargeable" }, { 0x85, 0x008c, "WarningCapacityLimit" }, { 0x85, 0x008d, "CapacityGranularity1" }, { 0x85, 0x008e, "CapacityGranularity2" }, { 0x85, 0x008f, "iOEMInformation" }, { 0x85, 0x00c0, "InhibitCharge" }, { 0x85, 0x00c1, "EnablePolling" }, { 0x85, 0x00c2, "ResetToZero" }, { 0x85, 0x00d0, "ACPresent" }, { 0x85, 0x00d1, "BatteryPresent" }, { 0x85, 0x00d2, "PowerFail" }, { 0x85, 0x00d3, "AlarmInhibited" }, { 0x85, 0x00d4, "ThermistorUnderRange" }, { 0x85, 0x00d5, "ThermistorHot" }, { 0x85, 0x00d6, "ThermistorCold" }, { 0x85, 0x00d7, "ThermistorOverRange" }, { 0x85, 0x00d8, "VoltageOutOfRange" }, { 0x85, 0x00d9, "CurrentOutOfRange" }, { 0x85, 0x00da, "CurrentNotRegulated" }, { 0x85, 0x00db, "VoltageNotRegulated" }, { 0x85, 0x00dc, "MasterMode" }, { 0x85, 0x00f0, "ChargerSelectorSupport" }, { 0x85, 0x00f1, "ChargerSpec" }, { 0x85, 0x00f2, "Level2" }, { 0x85, 0x00f3, "Level3" }, { 0x8c, 0, "BarcodeScanner" }, { 0x8c, 0x0001, "BarcodeBadgeReader" }, { 0x8c, 0x0002, "BarcodeScanner" }, { 0x8c, 0x0003, "DumbBarCodeScanner" }, { 0x8c, 0x0004, "CordlessScannerBase" }, { 0x8c, 0x0005, "BarCodeScannerCradle" }, { 0x8c, 0x0010, "AttributeReport" }, { 0x8c, 0x0011, "SettingsReport" }, { 0x8c, 0x0012, "ScannedDataReport" }, { 0x8c, 0x0013, "RawScannedDataReport" }, { 0x8c, 0x0014, "TriggerReport" }, { 0x8c, 0x0015, "StatusReport" }, { 0x8c, 0x0016, "UPCEANControlReport" }, { 0x8c, 0x0017, "EAN23LabelControlReport" }, { 0x8c, 0x0018, "Code39ControlReport" }, { 0x8c, 0x0019, "Interleaved2of5ControlReport" }, { 0x8c, 0x001a, "Standard2of5ControlReport" }, { 0x8c, 0x001b, "MSIPlesseyControlReport" }, { 0x8c, 0x001c, "CodabarControlReport" }, { 0x8c, 0x001d, "Code128ControlReport" }, { 0x8c, 0x001e, "Misc1DControlReport" }, { 0x8c, 0x001f, "2DControlReport" }, { 0x8c, 0x0030, "AimingPointerMode" }, { 0x8c, 0x0031, "BarCodePresentSensor" }, { 0x8c, 0x0032, "Class1ALaser" }, { 0x8c, 0x0033, "Class2Laser" }, { 0x8c, 0x0034, "HeaterPresent" }, { 0x8c, 0x0035, "ContactScanner" }, { 0x8c, 0x0036, "ElectronicArticleSurveillanceNotification" }, { 0x8c, 0x0037, "ConstantElectronicArticleSurveillance" }, { 0x8c, 0x0038, "ErrorIndication" }, { 0x8c, 0x0039, "FixedBeeper" }, { 0x8c, 0x003a, "GoodDecodeIndication" }, { 0x8c, 0x003b, "HandsFreeScanning" }, { 0x8c, 0x003c, "IntrinsicallySafe" }, { 0x8c, 0x003d, "KlasseEinsLaser" }, { 0x8c, 0x003e, "LongRangeScanner" }, { 0x8c, 0x003f, "MirrorSpeedControl" }, { 0x8c, 0x0040, "NotOnFileIndication" }, { 0x8c, 0x0041, "ProgrammableBeeper" }, { 0x8c, 0x0042, "Triggerless" }, { 0x8c, 0x0043, "Wand" }, { 0x8c, 0x0044, "WaterResistant" }, { 0x8c, 0x0045, "MultiRangeScanner" }, { 0x8c, 0x0046, "ProximitySensor" }, { 0x8c, 0x004d, "FragmentDecoding" }, { 0x8c, 0x004e, "ScannerReadConfidence" }, { 0x8c, 0x004f, "DataPrefix" }, { 0x8c, 0x0050, "PrefixAIMI" }, { 0x8c, 0x0051, "PrefixNone" }, { 0x8c, 0x0052, "PrefixProprietary" }, { 0x8c, 0x0055, "ActiveTime" }, { 0x8c, 0x0056, "AimingLaserPattern" }, { 0x8c, 0x0057, "BarCodePresent" }, { 0x8c, 0x0058, "BeeperState" }, { 0x8c, 0x0059, "LaserOnTime" }, { 0x8c, 0x005a, "LaserState" }, { 0x8c, 0x005b, "LockoutTime" }, { 0x8c, 0x005c, "MotorState" }, { 0x8c, 0x005d, "MotorTimeout" }, { 0x8c, 0x005e, "PowerOnResetScanner" }, { 0x8c, 0x005f, "PreventReadofBarcodes" }, { 0x8c, 0x0060, "InitiateBarcodeRead" }, { 0x8c, 0x0061, "TriggerState" }, { 0x8c, 0x0062, "TriggerMode" }, { 0x8c, 0x0063, "TriggerModeBlinkingLaserOn" }, { 0x8c, 0x0064, "TriggerModeContinuousLaserOn" }, { 0x8c, 0x0065, "TriggerModeLaseronwhilePulled" }, { 0x8c, 0x0066, "TriggerModeLaserstaysonafterrelease" }, { 0x8c, 0x006d, "CommitParameterstoNVM" }, { 0x8c, 0x006e, "ParameterScanning" }, { 0x8c, 0x006f, "ParametersChanged" }, { 0x8c, 0x0070, "Setparameterdefaultvalues" }, { 0x8c, 0x0075, "ScannerInCradle" }, { 0x8c, 0x0076, "ScannerInRange" }, { 0x8c, 0x007a, "AimDuration" }, { 0x8c, 0x007b, "GoodReadLampDuration" }, { 0x8c, 0x007c, "GoodReadLampIntensity" }, { 0x8c, 0x007d, "GoodReadLED" }, { 0x8c, 0x007e, "GoodReadToneFrequency" }, { 0x8c, 0x007f, "GoodReadToneLength" }, { 0x8c, 0x0080, "GoodReadToneVolume" }, { 0x8c, 0x0082, "NoReadMessage" }, { 0x8c, 0x0083, "NotonFileVolume" }, { 0x8c, 0x0084, "PowerupBeep" }, { 0x8c, 0x0085, "SoundErrorBeep" }, { 0x8c, 0x0086, "SoundGoodReadBeep" }, { 0x8c, 0x0087, "SoundNotOnFileBeep" }, { 0x8c, 0x0088, "GoodReadWhentoWrite" }, { 0x8c, 0x0089, "GRWTIAfterDecode" }, { 0x8c, 0x008a, "GRWTIBeepLampaftertransmit" }, { 0x8c, 0x008b, "GRWTINoBeepLampuseatall" }, { 0x8c, 0x0091, "BooklandEAN" }, { 0x8c, 0x0092, "ConvertEAN8to13Type" }, { 0x8c, 0x0093, "ConvertUPCAtoEAN13" }, { 0x8c, 0x0094, "ConvertUPCEtoA" }, { 0x8c, 0x0095, "EAN13" }, { 0x8c, 0x0096, "EAN8" }, { 0x8c, 0x0097, "EAN99128Mandatory" }, { 0x8c, 0x0098, "EAN99P5128Optional" }, { 0x8c, 0x0099, "EnableEANTwoLabel" }, { 0x8c, 0x009a, "UPCEAN" }, { 0x8c, 0x009b, "UPCEANCouponCode" }, { 0x8c, 0x009c, "UPCEANPeriodicals" }, { 0x8c, 0x009d, "UPCA" }, { 0x8c, 0x009e, "UPCAwith128Mandatory" }, { 0x8c, 0x009f, "UPCAwith128Optional" }, { 0x8c, 0x00a0, "UPCAwithP5Optional" }, { 0x8c, 0x00a1, "UPCE" }, { 0x8c, 0x00a2, "UPCE1" }, { 0x8c, 0x00a9, "Periodical" }, { 0x8c, 0x00aa, "PeriodicalAutoDiscriminate2" }, { 0x8c, 0x00ab, "PeriodicalOnlyDecodewith2" }, { 0x8c, 0x00ac, "PeriodicalIgnore2" }, { 0x8c, 0x00ad, "PeriodicalAutoDiscriminate5" }, { 0x8c, 0x00ae, "PeriodicalOnlyDecodewith5" }, { 0x8c, 0x00af, "PeriodicalIgnore5" }, { 0x8c, 0x00b0, "Check" }, { 0x8c, 0x00b1, "CheckDisablePrice" }, { 0x8c, 0x00b2, "CheckEnable4digitPrice" }, { 0x8c, 0x00b3, "CheckEnable5digitPrice" }, { 0x8c, 0x00b4, "CheckEnableEuropean4digitPrice" }, { 0x8c, 0x00b5, "CheckEnableEuropean5digitPrice" }, { 0x8c, 0x00b7, "EANTwoLabel" }, { 0x8c, 0x00b8, "EANThreeLabel" }, { 0x8c, 0x00b9, "EAN8FlagDigit1" }, { 0x8c, 0x00ba, "EAN8FlagDigit2" }, { 0x8c, 0x00bb, "EAN8FlagDigit3" }, { 0x8c, 0x00bc, "EAN13FlagDigit1" }, { 0x8c, 0x00bd, "EAN13FlagDigit2" }, { 0x8c, 0x00be, "EAN13FlagDigit3" }, { 0x8c, 0x00bf, "AddEAN23LabelDefinition" }, { 0x8c, 0x00c0, "ClearallEAN23LabelDefinitions" }, { 0x8c, 0x00c3, "Codabar" }, { 0x8c, 0x00c4, "Code128" }, { 0x8c, 0x00c7, "Code39" }, { 0x8c, 0x00c8, "Code93" }, { 0x8c, 0x00c9, "FullASCIIConversion" }, { 0x8c, 0x00ca, "Interleaved2of5" }, { 0x8c, 0x00cb, "ItalianPharmacyCode" }, { 0x8c, 0x00cc, "MSIPlessey" }, { 0x8c, 0x00cd, "Standard2of5IATA" }, { 0x8c, 0x00ce, "Standard2of5" }, { 0x8c, 0x00d3, "TransmitStartStop" }, { 0x8c, 0x00d4, "TriOptic" }, { 0x8c, 0x00d5, "UCCEAN128" }, { 0x8c, 0x00d6, "CheckDigit" }, { 0x8c, 0x00d7, "CheckDigitDisable" }, { 0x8c, 0x00d8, "CheckDigitEnableInterleaved2of5OPCC" }, { 0x8c, 0x00d9, "CheckDigitEnableInterleaved2of5USS" }, { 0x8c, 0x00da, "CheckDigitEnableStandard2of5OPCC" }, { 0x8c, 0x00db, "CheckDigitEnableStandard2of5USS" }, { 0x8c, 0x00dc, "CheckDigitEnableOneMSIPlessey" }, { 0x8c, 0x00dd, "CheckDigitEnableTwoMSIPlessey" }, { 0x8c, 0x00de, "CheckDigitCodabarEnable" }, { 0x8c, 0x00df, "CheckDigitCode39Enable" }, { 0x8c, 0x00f0, "TransmitCheckDigit" }, { 0x8c, 0x00f1, "DisableCheckDigitTransmit" }, { 0x8c, 0x00f2, "EnableCheckDigitTransmit" }, { 0x8c, 0x00fb, "SymbologyIdentifier1" }, { 0x8c, 0x00fc, "SymbologyIdentifier2" }, { 0x8c, 0x00fd, "SymbologyIdentifier3" }, { 0x8c, 0x00fe, "DecodedData" }, { 0x8c, 0x00ff, "DecodeDataContinued" }, { 0x8c, 0x0100, "BarSpaceData" }, { 0x8c, 0x0101, "ScannerDataAccuracy" }, { 0x8c, 0x0102, "RawDataPolarity" }, { 0x8c, 0x0103, "PolarityInvertedBarCode" }, { 0x8c, 0x0104, "PolarityNormalBarCode" }, { 0x8c, 0x0106, "MinimumLengthtoDecode" }, { 0x8c, 0x0107, "MaximumLengthtoDecode" }, { 0x8c, 0x0108, "DiscreteLengthtoDecode1" }, { 0x8c, 0x0109, "DiscreteLengthtoDecode2" }, { 0x8c, 0x010a, "DataLengthMethod" }, { 0x8c, 0x010b, "DLMethodReadany" }, { 0x8c, 0x010c, "DLMethodCheckinRange" }, { 0x8c, 0x010d, "DLMethodCheckforDiscrete" }, { 0x8c, 0x0110, "AztecCode" }, { 0x8c, 0x0111, "BC412" }, { 0x8c, 0x0112, "ChannelCode" }, { 0x8c, 0x0113, "Code16" }, { 0x8c, 0x0114, "Code32" }, { 0x8c, 0x0115, "Code49" }, { 0x8c, 0x0116, "CodeOne" }, { 0x8c, 0x0117, "Colorcode" }, { 0x8c, 0x0118, "DataMatrix" }, { 0x8c, 0x0119, "MaxiCode" }, { 0x8c, 0x011a, "MicroPDF" }, { 0x8c, 0x011b, "PDF417" }, { 0x8c, 0x011c, "PosiCode" }, { 0x8c, 0x011d, "QRCode" }, { 0x8c, 0x011e, "SuperCode" }, { 0x8c, 0x011f, "UltraCode" }, { 0x8c, 0x0120, "USD5SlugCode" }, { 0x8c, 0x0121, "VeriCode" }, { 0x8d, 0, "Scales" }, { 0x8d, 0x0001, "Scales" }, { 0x8d, 0x0020, "ScaleDevice" }, { 0x8d, 0x0021, "ScaleClass" }, { 0x8d, 0x0022, "ScaleClassIMetric" }, { 0x8d, 0x0023, "ScaleClassIIMetric" }, { 0x8d, 0x0024, "ScaleClassIIIMetric" }, { 0x8d, 0x0025, "ScaleClassIIILMetric" }, { 0x8d, 0x0026, "ScaleClassIVMetric" }, { 0x8d, 0x0027, "ScaleClassIIIEnglish" }, { 0x8d, 0x0028, "ScaleClassIIILEnglish" }, { 0x8d, 0x0029, "ScaleClassIVEnglish" }, { 0x8d, 0x002a, "ScaleClassGeneric" }, { 0x8d, 0x0030, "ScaleAttributeReport" }, { 0x8d, 0x0031, "ScaleControlReport" }, { 0x8d, 0x0032, "ScaleDataReport" }, { 0x8d, 0x0033, "ScaleStatusReport" }, { 0x8d, 0x0034, "ScaleWeightLimitReport" }, { 0x8d, 0x0035, "ScaleStatisticsReport" }, { 0x8d, 0x0040, "DataWeight" }, { 0x8d, 0x0041, "DataScaling" }, { 0x8d, 0x0050, "WeightUnit" }, { 0x8d, 0x0051, "WeightUnitMilligram" }, { 0x8d, 0x0052, "WeightUnitGram" }, { 0x8d, 0x0053, "WeightUnitKilogram" }, { 0x8d, 0x0054, "WeightUnitCarats" }, { 0x8d, 0x0055, "WeightUnitTaels" }, { 0x8d, 0x0056, "WeightUnitGrains" }, { 0x8d, 0x0057, "WeightUnitPennyweights" }, { 0x8d, 0x0058, "WeightUnitMetricTon" }, { 0x8d, 0x0059, "WeightUnitAvoirTon" }, { 0x8d, 0x005a, "WeightUnitTroyOunce" }, { 0x8d, 0x005b, "WeightUnitOunce" }, { 0x8d, 0x005c, "WeightUnitPound" }, { 0x8d, 0x0060, "CalibrationCount" }, { 0x8d, 0x0061, "ReZeroCount" }, { 0x8d, 0x0070, "ScaleStatus" }, { 0x8d, 0x0071, "ScaleStatusFault" }, { 0x8d, 0x0072, "ScaleStatusStableatCenterofZero" }, { 0x8d, 0x0073, "ScaleStatusInMotion" }, { 0x8d, 0x0074, "ScaleStatusWeightStable" }, { 0x8d, 0x0075, "ScaleStatusUnderZero" }, { 0x8d, 0x0076, "ScaleStatusOverWeightLimit" }, { 0x8d, 0x0077, "ScaleStatusRequiresCalibration" }, { 0x8d, 0x0078, "ScaleStatusRequiresRezeroing" }, { 0x8d, 0x0080, "ZeroScale" }, { 0x8d, 0x0081, "EnforcedZeroReturn" }, { 0x8e, 0, "MagneticStripeReader" }, { 0x8e, 0x0001, "MSRDeviceReadOnly" }, { 0x8e, 0x0011, "Track1Length" }, { 0x8e, 0x0012, "Track2Length" }, { 0x8e, 0x0013, "Track3Length" }, { 0x8e, 0x0014, "TrackJISLength" }, { 0x8e, 0x0020, "TrackData" }, { 0x8e, 0x0021, "Track1Data" }, { 0x8e, 0x0022, "Track2Data" }, { 0x8e, 0x0023, "Track3Data" }, { 0x8e, 0x0024, "TrackJISData" }, { 0x90, 0, "CameraControl" }, { 0x90, 0x0020, "CameraAutofocus" }, { 0x90, 0x0021, "CameraShutter" }, { 0x91, 0, "Arcade" }, { 0x91, 0x0001, "GeneralPurposeIOCard" }, { 0x91, 0x0002, "CoinDoor" }, { 0x91, 0x0003, "WatchdogTimer" }, { 0x91, 0x0030, "GeneralPurposeAnalogInputState" }, { 0x91, 0x0031, "GeneralPurposeDigitalInputState" }, { 0x91, 0x0032, "GeneralPurposeOpticalInputState" }, { 0x91, 0x0033, "GeneralPurposeDigitalOutputState" }, { 0x91, 0x0034, "NumberofCoinDoors" }, { 0x91, 0x0035, "CoinDrawerDropCount" }, { 0x91, 0x0036, "CoinDrawerStart" }, { 0x91, 0x0037, "CoinDrawerService" }, { 0x91, 0x0038, "CoinDrawerTilt" }, { 0x91, 0x0039, "CoinDoorTest" }, { 0x91, 0x0040, "CoinDoorLockout" }, { 0x91, 0x0041, "WatchdogTimeout" }, { 0x91, 0x0042, "WatchdogAction" }, { 0x91, 0x0043, "WatchdogReboot" }, { 0x91, 0x0044, "WatchdogRestart" }, { 0x91, 0x0045, "AlarmInput" }, { 0x91, 0x0046, "CoinDoorCounter" }, { 0x91, 0x0047, "IODirectionMapping" }, { 0x91, 0x0048, "SetIODirectionMapping" }, { 0x91, 0x0049, "ExtendedOpticalInputState" }, { 0x91, 0x004a, "PinPadInputState" }, { 0x91, 0x004b, "PinPadStatus" }, { 0x91, 0x004c, "PinPadOutput" }, { 0x91, 0x004d, "PinPadCommand" }, { 0xf1d0, 0, "FIDOAlliance" }, { 0xf1d0, 0x0001, "U2FAuthenticatorDevice" }, { 0xf1d0, 0x0020, "InputReportData" }, { 0xf1d0, 0x0021, "OutputReportData" }, /* pages 0xff00 to 0xffff are vendor-specific */ { 0xffff, 0, "Vendor-specific-FF" }, { 0, 0, NULL } }; /* Either output directly into simple seq_file, or (if f == NULL) * allocate a separate buffer that will then be passed to the 'events' * ringbuffer. * * This is because these functions can be called both for "one-shot" * "rdesc" while resolving, or for blocking "events". * * This holds both for resolv_usage_page() and hid_resolv_usage(). */ static char *resolv_usage_page(unsigned page, struct seq_file *f) { const struct hid_usage_entry *p; char *buf = NULL; if (!f) { buf = kzalloc(HID_DEBUG_BUFSIZE, GFP_ATOMIC); if (!buf) return ERR_PTR(-ENOMEM); } for (p = hid_usage_table; p->description; p++) if (p->page == page) { if (!f) { snprintf(buf, HID_DEBUG_BUFSIZE, "%s", p->description); return buf; } else { seq_printf(f, "%s", p->description); return NULL; } } if (!f) snprintf(buf, HID_DEBUG_BUFSIZE, "%04x", page); else seq_printf(f, "%04x", page); return buf; } char *hid_resolv_usage(unsigned usage, struct seq_file *f) { const struct hid_usage_entry *p; const struct hid_usage_entry *m; char *buf = NULL; int len = 0; const char *modifier = NULL; unsigned int usage_modifier = usage & 0xF000; unsigned int usage_actual = usage & 0xFFFF; buf = resolv_usage_page(usage >> 16, f); if (IS_ERR(buf)) { pr_err("error allocating HID debug buffer\n"); return NULL; } if (!f) { len = strlen(buf); len += scnprintf(buf + len, HID_DEBUG_BUFSIZE - len, "."); } else { seq_printf(f, "."); } for (p = hid_usage_table; p->description; p++) if (p->page == (usage >> 16)) { if (p->page == 0x20 && usage_modifier) { for (m = p; m->description; m++) { if (p->page == m->page && m->usage == usage_modifier) { modifier = m->description; break; } } if (modifier) usage_actual = usage_actual & 0x0FFF; } if (!modifier) modifier = ""; for(++p; p->description && p->usage != 0; p++) if (p->usage == usage_actual) { if (!f) snprintf(buf + len, HID_DEBUG_BUFSIZE - len, "%s%s", p->description, modifier); else seq_printf(f, "%s%s", p->description, modifier); return buf; } break; } if (!f) snprintf(buf + len, HID_DEBUG_BUFSIZE - len, "%04x", usage & 0xffff); else seq_printf(f, "%04x", usage & 0xffff); return buf; } EXPORT_SYMBOL_GPL(hid_resolv_usage); static void tab(int n, struct seq_file *f) { seq_printf(f, "%*s", n, ""); } void hid_dump_field(struct hid_field *field, int n, struct seq_file *f) { int j; if (field->physical) { tab(n, f); seq_printf(f, "Physical("); hid_resolv_usage(field->physical, f); seq_printf(f, ")\n"); } if (field->logical) { tab(n, f); seq_printf(f, "Logical("); hid_resolv_usage(field->logical, f); seq_printf(f, ")\n"); } if (field->application) { tab(n, f); seq_printf(f, "Application("); hid_resolv_usage(field->application, f); seq_printf(f, ")\n"); } tab(n, f); seq_printf(f, "Usage(%d)\n", field->maxusage); for (j = 0; j < field->maxusage; j++) { tab(n+2, f); hid_resolv_usage(field->usage[j].hid, f); seq_printf(f, "\n"); } if (field->logical_minimum != field->logical_maximum) { tab(n, f); seq_printf(f, "Logical Minimum(%d)\n", field->logical_minimum); tab(n, f); seq_printf(f, "Logical Maximum(%d)\n", field->logical_maximum); } if (field->physical_minimum != field->physical_maximum) { tab(n, f); seq_printf(f, "Physical Minimum(%d)\n", field->physical_minimum); tab(n, f); seq_printf(f, "Physical Maximum(%d)\n", field->physical_maximum); } if (field->unit_exponent) { tab(n, f); seq_printf(f, "Unit Exponent(%d)\n", field->unit_exponent); } if (field->unit) { static const char *systems[5] = { "None", "SI Linear", "SI Rotation", "English Linear", "English Rotation" }; static const char *units[5][8] = { { "None", "None", "None", "None", "None", "None", "None", "None" }, { "None", "Centimeter", "Gram", "Seconds", "Kelvin", "Ampere", "Candela", "None" }, { "None", "Radians", "Gram", "Seconds", "Kelvin", "Ampere", "Candela", "None" }, { "None", "Inch", "Slug", "Seconds", "Fahrenheit", "Ampere", "Candela", "None" }, { "None", "Degrees", "Slug", "Seconds", "Fahrenheit", "Ampere", "Candela", "None" } }; int i; int sys; __u32 data = field->unit; /* First nibble tells us which system we're in. */ sys = data & 0xf; data >>= 4; if(sys > 4) { tab(n, f); seq_printf(f, "Unit(Invalid)\n"); } else { int earlier_unit = 0; tab(n, f); seq_printf(f, "Unit(%s : ", systems[sys]); for (i=1 ; i<sizeof(__u32)*2 ; i++) { char nibble = data & 0xf; data >>= 4; if (nibble != 0) { if(earlier_unit++ > 0) seq_printf(f, "*"); seq_printf(f, "%s", units[sys][i]); if(nibble != 1) { /* This is a _signed_ nibble(!) */ int val = nibble & 0x7; if(nibble & 0x08) val = -((0x7 & ~val) +1); seq_printf(f, "^%d", val); } } } seq_printf(f, ")\n"); } } tab(n, f); seq_printf(f, "Report Size(%u)\n", field->report_size); tab(n, f); seq_printf(f, "Report Count(%u)\n", field->report_count); tab(n, f); seq_printf(f, "Report Offset(%u)\n", field->report_offset); tab(n, f); seq_printf(f, "Flags( "); j = field->flags; seq_printf(f, "%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : ""); seq_printf(f, "%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array "); seq_printf(f, "%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute "); seq_printf(f, "%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : ""); seq_printf(f, "%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : ""); seq_printf(f, "%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : ""); seq_printf(f, "%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : ""); seq_printf(f, "%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : ""); seq_printf(f, "%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ? "BufferedByte " : ""); seq_printf(f, ")\n"); } EXPORT_SYMBOL_GPL(hid_dump_field); void hid_dump_device(struct hid_device *device, struct seq_file *f) { struct hid_report_enum *report_enum; struct hid_report *report; struct list_head *list; unsigned i,k; static const char *table[] = {"INPUT", "OUTPUT", "FEATURE"}; for (i = 0; i < HID_REPORT_TYPES; i++) { report_enum = device->report_enum + i; list = report_enum->report_list.next; while (list != &report_enum->report_list) { report = (struct hid_report *) list; tab(2, f); seq_printf(f, "%s", table[i]); if (report->id) seq_printf(f, "(%d)", report->id); seq_printf(f, "[%s]", table[report->type]); seq_printf(f, "\n"); for (k = 0; k < report->maxfield; k++) { tab(4, f); seq_printf(f, "Field(%d)\n", k); hid_dump_field(report->field[k], 6, f); } list = list->next; } } } EXPORT_SYMBOL_GPL(hid_dump_device); /* enqueue string to 'events' ring buffer */ void hid_debug_event(struct hid_device *hdev, char *buf) { struct hid_debug_list *list; unsigned long flags; spin_lock_irqsave(&hdev->debug_list_lock, flags); list_for_each_entry(list, &hdev->debug_list, node) kfifo_in(&list->hid_debug_fifo, buf, strlen(buf)); spin_unlock_irqrestore(&hdev->debug_list_lock, flags); wake_up_interruptible(&hdev->debug_wait); } EXPORT_SYMBOL_GPL(hid_debug_event); void hid_dump_report(struct hid_device *hid, int type, u8 *data, int size) { struct hid_report_enum *report_enum; char *buf; unsigned int i; buf = kmalloc(HID_DEBUG_BUFSIZE, GFP_ATOMIC); if (!buf) return; report_enum = hid->report_enum + type; /* dump the report */ snprintf(buf, HID_DEBUG_BUFSIZE - 1, "\nreport (size %u) (%snumbered) = ", size, report_enum->numbered ? "" : "un"); hid_debug_event(hid, buf); for (i = 0; i < size; i++) { snprintf(buf, HID_DEBUG_BUFSIZE - 1, " %02x", data[i]); hid_debug_event(hid, buf); } hid_debug_event(hid, "\n"); kfree(buf); } EXPORT_SYMBOL_GPL(hid_dump_report); void hid_dump_input(struct hid_device *hdev, struct hid_usage *usage, __s32 value) { char *buf; int len; buf = hid_resolv_usage(usage->hid, NULL); if (!buf) return; len = strlen(buf); snprintf(buf + len, HID_DEBUG_BUFSIZE - len - 1, " = %d\n", value); hid_debug_event(hdev, buf); kfree(buf); wake_up_interruptible(&hdev->debug_wait); } EXPORT_SYMBOL_GPL(hid_dump_input); static const char *events[EV_MAX + 1] = { [EV_SYN] = "Sync", [EV_KEY] = "Key", [EV_REL] = "Relative", [EV_ABS] = "Absolute", [EV_MSC] = "Misc", [EV_LED] = "LED", [EV_SND] = "Sound", [EV_REP] = "Repeat", [EV_FF] = "ForceFeedback", [EV_PWR] = "Power", [EV_FF_STATUS] = "ForceFeedbackStatus", [EV_SW] = "Software", }; static const char *syncs[SYN_CNT] = { [SYN_REPORT] = "Report", [SYN_CONFIG] = "Config", [SYN_MT_REPORT] = "MT Report", [SYN_DROPPED] = "Dropped", }; static const char *keys[KEY_MAX + 1] = { [KEY_RESERVED] = "Reserved", [KEY_ESC] = "Esc", [KEY_1] = "1", [KEY_2] = "2", [KEY_3] = "3", [KEY_4] = "4", [KEY_5] = "5", [KEY_6] = "6", [KEY_7] = "7", [KEY_8] = "8", [KEY_9] = "9", [KEY_0] = "0", [KEY_MINUS] = "Minus", [KEY_EQUAL] = "Equal", [KEY_BACKSPACE] = "Backspace", [KEY_TAB] = "Tab", [KEY_Q] = "Q", [KEY_W] = "W", [KEY_E] = "E", [KEY_R] = "R", [KEY_T] = "T", [KEY_Y] = "Y", [KEY_U] = "U", [KEY_I] = "I", [KEY_O] = "O", [KEY_P] = "P", [KEY_LEFTBRACE] = "LeftBrace", [KEY_RIGHTBRACE] = "RightBrace", [KEY_ENTER] = "Enter", [KEY_LEFTCTRL] = "LeftControl", [KEY_A] = "A", [KEY_S] = "S", [KEY_D] = "D", [KEY_F] = "F", [KEY_G] = "G", [KEY_H] = "H", [KEY_J] = "J", [KEY_K] = "K", [KEY_L] = "L", [KEY_SEMICOLON] = "Semicolon", [KEY_APOSTROPHE] = "Apostrophe", [KEY_GRAVE] = "Grave", [KEY_LEFTSHIFT] = "LeftShift", [KEY_BACKSLASH] = "BackSlash", [KEY_Z] = "Z", [KEY_X] = "X", [KEY_C] = "C", [KEY_V] = "V", [KEY_B] = "B", [KEY_N] = "N", [KEY_M] = "M", [KEY_COMMA] = "Comma", [KEY_DOT] = "Dot", [KEY_SLASH] = "Slash", [KEY_RIGHTSHIFT] = "RightShift", [KEY_KPASTERISK] = "KPAsterisk", [KEY_LEFTALT] = "LeftAlt", [KEY_SPACE] = "Space", [KEY_CAPSLOCK] = "CapsLock", [KEY_F1] = "F1", [KEY_F2] = "F2", [KEY_F3] = "F3", [KEY_F4] = "F4", [KEY_F5] = "F5", [KEY_F6] = "F6", [KEY_F7] = "F7", [KEY_F8] = "F8", [KEY_F9] = "F9", [KEY_F10] = "F10", [KEY_NUMLOCK] = "NumLock", [KEY_SCROLLLOCK] = "ScrollLock", [KEY_KP7] = "KP7", [KEY_KP8] = "KP8", [KEY_KP9] = "KP9", [KEY_KPMINUS] = "KPMinus", [KEY_KP4] = "KP4", [KEY_KP5] = "KP5", [KEY_KP6] = "KP6", [KEY_KPPLUS] = "KPPlus", [KEY_KP1] = "KP1", [KEY_KP2] = "KP2", [KEY_KP3] = "KP3", [KEY_KP0] = "KP0", [KEY_KPDOT] = "KPDot", [KEY_ZENKAKUHANKAKU] = "Zenkaku/Hankaku", [KEY_102ND] = "102nd", [KEY_F11] = "F11", [KEY_F12] = "F12", [KEY_RO] = "RO", [KEY_KATAKANA] = "Katakana", [KEY_HIRAGANA] = "HIRAGANA", [KEY_HENKAN] = "Henkan", [KEY_KATAKANAHIRAGANA] = "Katakana/Hiragana", [KEY_MUHENKAN] = "Muhenkan", [KEY_KPJPCOMMA] = "KPJpComma", [KEY_KPENTER] = "KPEnter", [KEY_RIGHTCTRL] = "RightCtrl", [KEY_KPSLASH] = "KPSlash", [KEY_SYSRQ] = "SysRq", [KEY_RIGHTALT] = "RightAlt", [KEY_LINEFEED] = "LineFeed", [KEY_HOME] = "Home", [KEY_UP] = "Up", [KEY_PAGEUP] = "PageUp", [KEY_LEFT] = "Left", [KEY_RIGHT] = "Right", [KEY_END] = "End", [KEY_DOWN] = "Down", [KEY_PAGEDOWN] = "PageDown", [KEY_INSERT] = "Insert", [KEY_DELETE] = "Delete", [KEY_MACRO] = "Macro", [KEY_MUTE] = "Mute", [KEY_VOLUMEDOWN] = "VolumeDown", [KEY_VOLUMEUP] = "VolumeUp", [KEY_POWER] = "Power", [KEY_KPEQUAL] = "KPEqual", [KEY_KPPLUSMINUS] = "KPPlusMinus", [KEY_PAUSE] = "Pause", [KEY_KPCOMMA] = "KPComma", [KEY_HANGUEL] = "Hangeul", [KEY_HANJA] = "Hanja", [KEY_YEN] = "Yen", [KEY_LEFTMETA] = "LeftMeta", [KEY_RIGHTMETA] = "RightMeta", [KEY_COMPOSE] = "Compose", [KEY_STOP] = "Stop", [KEY_AGAIN] = "Again", [KEY_PROPS] = "Props", [KEY_UNDO] = "Undo", [KEY_FRONT] = "Front", [KEY_COPY] = "Copy", [KEY_OPEN] = "Open", [KEY_PASTE] = "Paste", [KEY_FIND] = "Find", [KEY_CUT] = "Cut", [KEY_HELP] = "Help", [KEY_MENU] = "Menu", [KEY_CALC] = "Calc", [KEY_SETUP] = "Setup", [KEY_SLEEP] = "Sleep", [KEY_WAKEUP] = "WakeUp", [KEY_FILE] = "File", [KEY_SENDFILE] = "SendFile", [KEY_DELETEFILE] = "DeleteFile", [KEY_XFER] = "X-fer", [KEY_PROG1] = "Prog1", [KEY_PROG2] = "Prog2", [KEY_WWW] = "WWW", [KEY_MSDOS] = "MSDOS", [KEY_COFFEE] = "Coffee", [KEY_ROTATE_DISPLAY] = "RotateDisplay", [KEY_CYCLEWINDOWS] = "CycleWindows", [KEY_MAIL] = "Mail", [KEY_BOOKMARKS] = "Bookmarks", [KEY_COMPUTER] = "Computer", [KEY_BACK] = "Back", [KEY_FORWARD] = "Forward", [KEY_CLOSECD] = "CloseCD", [KEY_EJECTCD] = "EjectCD", [KEY_EJECTCLOSECD] = "EjectCloseCD", [KEY_NEXTSONG] = "NextSong", [KEY_PLAYPAUSE] = "PlayPause", [KEY_PREVIOUSSONG] = "PreviousSong", [KEY_STOPCD] = "StopCD", [KEY_RECORD] = "Record", [KEY_REWIND] = "Rewind", [KEY_PHONE] = "Phone", [KEY_ISO] = "ISOKey", [KEY_CONFIG] = "Config", [KEY_HOMEPAGE] = "HomePage", [KEY_REFRESH] = "Refresh", [KEY_EXIT] = "Exit", [KEY_MOVE] = "Move", [KEY_EDIT] = "Edit", [KEY_SCROLLUP] = "ScrollUp", [KEY_SCROLLDOWN] = "ScrollDown", [KEY_KPLEFTPAREN] = "KPLeftParenthesis", [KEY_KPRIGHTPAREN] = "KPRightParenthesis", [KEY_NEW] = "New", [KEY_REDO] = "Redo", [KEY_F13] = "F13", [KEY_F14] = "F14", [KEY_F15] = "F15", [KEY_F16] = "F16", [KEY_F17] = "F17", [KEY_F18] = "F18", [KEY_F19] = "F19", [KEY_F20] = "F20", [KEY_F21] = "F21", [KEY_F22] = "F22", [KEY_F23] = "F23", [KEY_F24] = "F24", [KEY_PLAYCD] = "PlayCD", [KEY_PAUSECD] = "PauseCD", [KEY_PROG3] = "Prog3", [KEY_PROG4] = "Prog4", [KEY_ALL_APPLICATIONS] = "AllApplications", [KEY_SUSPEND] = "Suspend", [KEY_CLOSE] = "Close", [KEY_PLAY] = "Play", [KEY_FASTFORWARD] = "FastForward", [KEY_BASSBOOST] = "BassBoost", [KEY_PRINT] = "Print", [KEY_HP] = "HP", [KEY_CAMERA] = "Camera", [KEY_SOUND] = "Sound", [KEY_QUESTION] = "Question", [KEY_EMAIL] = "Email", [KEY_CHAT] = "Chat", [KEY_SEARCH] = "Search", [KEY_CONNECT] = "Connect", [KEY_FINANCE] = "Finance", [KEY_SPORT] = "Sport", [KEY_SHOP] = "Shop", [KEY_ALTERASE] = "AlternateErase", [KEY_CANCEL] = "Cancel", [KEY_BRIGHTNESSDOWN] = "BrightnessDown", [KEY_BRIGHTNESSUP] = "BrightnessUp", [KEY_MEDIA] = "Media", [KEY_UNKNOWN] = "Unknown", [BTN_DPAD_UP] = "BtnDPadUp", [BTN_DPAD_DOWN] = "BtnDPadDown", [BTN_DPAD_LEFT] = "BtnDPadLeft", [BTN_DPAD_RIGHT] = "BtnDPadRight", [BTN_0] = "Btn0", [BTN_1] = "Btn1", [BTN_2] = "Btn2", [BTN_3] = "Btn3", [BTN_4] = "Btn4", [BTN_5] = "Btn5", [BTN_6] = "Btn6", [BTN_7] = "Btn7", [BTN_8] = "Btn8", [BTN_9] = "Btn9", [BTN_LEFT] = "LeftBtn", [BTN_RIGHT] = "RightBtn", [BTN_MIDDLE] = "MiddleBtn", [BTN_SIDE] = "SideBtn", [BTN_EXTRA] = "ExtraBtn", [BTN_FORWARD] = "ForwardBtn", [BTN_BACK] = "BackBtn", [BTN_TASK] = "TaskBtn", [BTN_TRIGGER] = "Trigger", [BTN_THUMB] = "ThumbBtn", [BTN_THUMB2] = "ThumbBtn2", [BTN_TOP] = "TopBtn", [BTN_TOP2] = "TopBtn2", [BTN_PINKIE] = "PinkieBtn", [BTN_BASE] = "BaseBtn", [BTN_BASE2] = "BaseBtn2", [BTN_BASE3] = "BaseBtn3", [BTN_BASE4] = "BaseBtn4", [BTN_BASE5] = "BaseBtn5", [BTN_BASE6] = "BaseBtn6", [BTN_DEAD] = "BtnDead", [BTN_A] = "BtnA", [BTN_B] = "BtnB", [BTN_C] = "BtnC", [BTN_X] = "BtnX", [BTN_Y] = "BtnY", [BTN_Z] = "BtnZ", [BTN_TL] = "BtnTL", [BTN_TR] = "BtnTR", [BTN_TL2] = "BtnTL2", [BTN_TR2] = "BtnTR2", [BTN_SELECT] = "BtnSelect", [BTN_START] = "BtnStart", [BTN_MODE] = "BtnMode", [BTN_THUMBL] = "BtnThumbL", [BTN_THUMBR] = "BtnThumbR", [BTN_GRIPL] = "BtnGripL", [BTN_GRIPR] = "BtnGripR", [BTN_GRIPL2] = "BtnGripL2", [BTN_GRIPR2] = "BtnGripR2", [BTN_TOOL_PEN] = "ToolPen", [BTN_TOOL_RUBBER] = "ToolRubber", [BTN_TOOL_BRUSH] = "ToolBrush", [BTN_TOOL_PENCIL] = "ToolPencil", [BTN_TOOL_AIRBRUSH] = "ToolAirbrush", [BTN_TOOL_FINGER] = "ToolFinger", [BTN_TOOL_MOUSE] = "ToolMouse", [BTN_TOOL_LENS] = "ToolLens", [BTN_TOUCH] = "Touch", [BTN_STYLUS] = "Stylus", [BTN_STYLUS2] = "Stylus2", [BTN_TOOL_DOUBLETAP] = "ToolDoubleTap", [BTN_TOOL_TRIPLETAP] = "ToolTripleTap", [BTN_TOOL_QUADTAP] = "ToolQuadrupleTap", [BTN_GEAR_DOWN] = "BtnGearDown", [BTN_GEAR_UP] = "BtnGearUp", [KEY_OK] = "Ok", [KEY_SELECT] = "Select", [KEY_GOTO] = "Goto", [KEY_CLEAR] = "Clear", [KEY_POWER2] = "Power2", [KEY_OPTION] = "Option", [KEY_INFO] = "Info", [KEY_TIME] = "Time", [KEY_VENDOR] = "Vendor", [KEY_ARCHIVE] = "Archive", [KEY_PROGRAM] = "Program", [KEY_CHANNEL] = "Channel", [KEY_FAVORITES] = "Favorites", [KEY_EPG] = "EPG", [KEY_PVR] = "PVR", [KEY_MHP] = "MHP", [KEY_LANGUAGE] = "Language", [KEY_TITLE] = "Title", [KEY_SUBTITLE] = "Subtitle", [KEY_ANGLE] = "Angle", [KEY_MODE] = "Mode", [KEY_KEYBOARD] = "Keyboard", [KEY_PC] = "PC", [KEY_TV] = "TV", [KEY_TV2] = "TV2", [KEY_VCR] = "VCR", [KEY_VCR2] = "VCR2", [KEY_SAT] = "Sat", [KEY_SAT2] = "Sat2", [KEY_CD] = "CD", [KEY_TAPE] = "Tape", [KEY_RADIO] = "Radio", [KEY_TUNER] = "Tuner", [KEY_PLAYER] = "Player", [KEY_TEXT] = "Text", [KEY_DVD] = "DVD", [KEY_AUX] = "Aux", [KEY_MP3] = "MP3", [KEY_AUDIO] = "Audio", [KEY_VIDEO] = "Video", [KEY_DIRECTORY] = "Directory", [KEY_LIST] = "List", [KEY_MEMO] = "Memo", [KEY_CALENDAR] = "Calendar", [KEY_RED] = "Red", [KEY_GREEN] = "Green", [KEY_YELLOW] = "Yellow", [KEY_BLUE] = "Blue", [KEY_CHANNELUP] = "ChannelUp", [KEY_CHANNELDOWN] = "ChannelDown", [KEY_FIRST] = "First", [KEY_LAST] = "Last", [KEY_AB] = "AB", [KEY_NEXT] = "Next", [KEY_RESTART] = "Restart", [KEY_SLOW] = "Slow", [KEY_SHUFFLE] = "Shuffle", [KEY_BREAK] = "Break", [KEY_PREVIOUS] = "Previous", [KEY_DIGITS] = "Digits", [KEY_TEEN] = "TEEN", [KEY_TWEN] = "TWEN", [KEY_DEL_EOL] = "DeleteEOL", [KEY_DEL_EOS] = "DeleteEOS", [KEY_INS_LINE] = "InsertLine", [KEY_DEL_LINE] = "DeleteLine", [KEY_SEND] = "Send", [KEY_REPLY] = "Reply", [KEY_FORWARDMAIL] = "ForwardMail", [KEY_SAVE] = "Save", [KEY_DOCUMENTS] = "Documents", [KEY_SPELLCHECK] = "SpellCheck", [KEY_LOGOFF] = "Logoff", [KEY_FN] = "Fn", [KEY_FN_ESC] = "Fn+ESC", [KEY_FN_1] = "Fn+1", [KEY_FN_2] = "Fn+2", [KEY_FN_B] = "Fn+B", [KEY_FN_D] = "Fn+D", [KEY_FN_E] = "Fn+E", [KEY_FN_F] = "Fn+F", [KEY_FN_S] = "Fn+S", [KEY_FN_F1] = "Fn+F1", [KEY_FN_F2] = "Fn+F2", [KEY_FN_F3] = "Fn+F3", [KEY_FN_F4] = "Fn+F4", [KEY_FN_F5] = "Fn+F5", [KEY_FN_F6] = "Fn+F6", [KEY_FN_F7] = "Fn+F7", [KEY_FN_F8] = "Fn+F8", [KEY_FN_F9] = "Fn+F9", [KEY_FN_F10] = "Fn+F10", [KEY_FN_F11] = "Fn+F11", [KEY_FN_F12] = "Fn+F12", [KEY_KBDILLUMTOGGLE] = "KbdIlluminationToggle", [KEY_KBDILLUMDOWN] = "KbdIlluminationDown", [KEY_KBDILLUMUP] = "KbdIlluminationUp", [KEY_SWITCHVIDEOMODE] = "SwitchVideoMode", [KEY_BUTTONCONFIG] = "ButtonConfig", [KEY_TASKMANAGER] = "TaskManager", [KEY_JOURNAL] = "Journal", [KEY_CONTROLPANEL] = "ControlPanel", [KEY_APPSELECT] = "AppSelect", [KEY_SCREENSAVER] = "ScreenSaver", [KEY_VOICECOMMAND] = "VoiceCommand", [KEY_ASSISTANT] = "Assistant", [KEY_KBD_LAYOUT_NEXT] = "KbdLayoutNext", [KEY_EMOJI_PICKER] = "EmojiPicker", [KEY_CAMERA_ACCESS_ENABLE] = "CameraAccessEnable", [KEY_CAMERA_ACCESS_DISABLE] = "CameraAccessDisable", [KEY_CAMERA_ACCESS_TOGGLE] = "CameraAccessToggle", [KEY_ACCESSIBILITY] = "Accessibility", [KEY_DO_NOT_DISTURB] = "DoNotDisturb", [KEY_DICTATE] = "Dictate", [KEY_MICMUTE] = "MicrophoneMute", [KEY_BRIGHTNESS_MIN] = "BrightnessMin", [KEY_BRIGHTNESS_MAX] = "BrightnessMax", [KEY_BRIGHTNESS_AUTO] = "BrightnessAuto", [KEY_KBDINPUTASSIST_PREV] = "KbdInputAssistPrev", [KEY_KBDINPUTASSIST_NEXT] = "KbdInputAssistNext", [KEY_KBDINPUTASSIST_PREVGROUP] = "KbdInputAssistPrevGroup", [KEY_KBDINPUTASSIST_NEXTGROUP] = "KbdInputAssistNextGroup", [KEY_KBDINPUTASSIST_ACCEPT] = "KbdInputAssistAccept", [KEY_KBDINPUTASSIST_CANCEL] = "KbdInputAssistCancel", [KEY_MACRO1] = "Macro1", [KEY_MACRO2] = "Macro2", [KEY_MACRO3] = "Macro3", [KEY_MACRO4] = "Macro4", [KEY_MACRO5] = "Macro5", [KEY_MACRO6] = "Macro6", [KEY_MACRO7] = "Macro7", [KEY_MACRO8] = "Macro8", [KEY_MACRO9] = "Macro9", [KEY_MACRO10] = "Macro10", [KEY_MACRO11] = "Macro11", [KEY_MACRO12] = "Macro12", [KEY_MACRO13] = "Macro13", [KEY_MACRO14] = "Macro14", [KEY_MACRO15] = "Macro15", [KEY_MACRO16] = "Macro16", [KEY_MACRO17] = "Macro17", [KEY_MACRO18] = "Macro18", [KEY_MACRO19] = "Macro19", [KEY_MACRO20] = "Macro20", [KEY_MACRO21] = "Macro21", [KEY_MACRO22] = "Macro22", [KEY_MACRO23] = "Macro23", [KEY_MACRO24] = "Macro24", [KEY_MACRO25] = "Macro25", [KEY_MACRO26] = "Macro26", [KEY_MACRO27] = "Macro27", [KEY_MACRO28] = "Macro28", [KEY_MACRO29] = "Macro29", [KEY_MACRO30] = "Macro30", [BTN_TRIGGER_HAPPY1] = "TriggerHappy1", [BTN_TRIGGER_HAPPY2] = "TriggerHappy2", [BTN_TRIGGER_HAPPY3] = "TriggerHappy3", [BTN_TRIGGER_HAPPY4] = "TriggerHappy4", [BTN_TRIGGER_HAPPY5] = "TriggerHappy5", [BTN_TRIGGER_HAPPY6] = "TriggerHappy6", [BTN_TRIGGER_HAPPY7] = "TriggerHappy7", [BTN_TRIGGER_HAPPY8] = "TriggerHappy8", [BTN_TRIGGER_HAPPY9] = "TriggerHappy9", [BTN_TRIGGER_HAPPY10] = "TriggerHappy10", [BTN_TRIGGER_HAPPY11] = "TriggerHappy11", [BTN_TRIGGER_HAPPY12] = "TriggerHappy12", [BTN_TRIGGER_HAPPY13] = "TriggerHappy13", [BTN_TRIGGER_HAPPY14] = "TriggerHappy14", [BTN_TRIGGER_HAPPY15] = "TriggerHappy15", [BTN_TRIGGER_HAPPY16] = "TriggerHappy16", [BTN_TRIGGER_HAPPY17] = "TriggerHappy17", [BTN_TRIGGER_HAPPY18] = "TriggerHappy18", [BTN_TRIGGER_HAPPY19] = "TriggerHappy19", [BTN_TRIGGER_HAPPY20] = "TriggerHappy20", [BTN_TRIGGER_HAPPY21] = "TriggerHappy21", [BTN_TRIGGER_HAPPY22] = "TriggerHappy22", [BTN_TRIGGER_HAPPY23] = "TriggerHappy23", [BTN_TRIGGER_HAPPY24] = "TriggerHappy24", [BTN_TRIGGER_HAPPY25] = "TriggerHappy25", [BTN_TRIGGER_HAPPY26] = "TriggerHappy26", [BTN_TRIGGER_HAPPY27] = "TriggerHappy27", [BTN_TRIGGER_HAPPY28] = "TriggerHappy28", [BTN_TRIGGER_HAPPY29] = "TriggerHappy29", [BTN_TRIGGER_HAPPY30] = "TriggerHappy30", [BTN_TRIGGER_HAPPY31] = "TriggerHappy31", [BTN_TRIGGER_HAPPY32] = "TriggerHappy32", [BTN_TRIGGER_HAPPY33] = "TriggerHappy33", [BTN_TRIGGER_HAPPY34] = "TriggerHappy34", [BTN_TRIGGER_HAPPY35] = "TriggerHappy35", [BTN_TRIGGER_HAPPY36] = "TriggerHappy36", [BTN_TRIGGER_HAPPY37] = "TriggerHappy37", [BTN_TRIGGER_HAPPY38] = "TriggerHappy38", [BTN_TRIGGER_HAPPY39] = "TriggerHappy39", [BTN_TRIGGER_HAPPY40] = "TriggerHappy40", [BTN_STYLUS3] = "Stylus3", [BTN_TOOL_QUINTTAP] = "ToolQuintTap", [KEY_10CHANNELSDOWN] = "10ChannelsDown", [KEY_10CHANNELSUP] = "10ChannelsUp", [KEY_3D_MODE] = "3DMode", [KEY_ADDRESSBOOK] = "Addressbook", [KEY_ALS_TOGGLE] = "ALSToggle", [KEY_ASPECT_RATIO] = "AspectRatio", [KEY_ATTENDANT_OFF] = "AttendantOff", [KEY_ATTENDANT_ON] = "AttendantOn", [KEY_ATTENDANT_TOGGLE] = "AttendantToggle", [KEY_AUDIO_DESC] = "AudioDesc", [KEY_AUTOPILOT_ENGAGE_TOGGLE] = "AutoPiloteEngage", [KEY_BATTERY] = "Battery", [KEY_BLUETOOTH] = "BlueTooth", [KEY_BRIGHTNESS_CYCLE] = "BrightnessCycle", [KEY_BRIGHTNESS_MENU] = "BrightnessMenu", [KEY_BRL_DOT1] = "BrlDot1", [KEY_BRL_DOT10] = "BrlDot10", [KEY_BRL_DOT2] = "BrlDot2", [KEY_BRL_DOT3] = "BrlDot3", [KEY_BRL_DOT4] = "BrlDot4", [KEY_BRL_DOT5] = "BrlDot5", [KEY_BRL_DOT6] = "BrlDot6", [KEY_BRL_DOT7] = "BrlDot7", [KEY_BRL_DOT8] = "BrlDot8", [KEY_BRL_DOT9] = "BrlDot9", [KEY_CAMERA_DOWN] = "CameraDown", [KEY_CAMERA_FOCUS] = "CameraFocus", [KEY_CAMERA_LEFT] = "CameraLeft", [KEY_CAMERA_RIGHT] = "CameraRight", [KEY_CAMERA_UP] = "CameraUp", [KEY_CAMERA_ZOOMIN] = "CameraZoomIn", [KEY_CAMERA_ZOOMOUT] = "CameraZoomOut", [KEY_CLEARVU_SONAR] = "ClearVUSonar", [KEY_CONTEXT_MENU] = "ContextMenu", [KEY_DATA] = "Data", [KEY_DATABASE] = "DataBase", [KEY_DISPLAY_OFF] = "DisplayOff", [KEY_DISPLAYTOGGLE] = "DisplayToggle", [KEY_DOLLAR] = "Dollar", [KEY_DUAL_RANGE_RADAR] = "DualRangeRadat", [KEY_EDITOR] = "Editor", [KEY_EURO] = "Euro", [KEY_FASTREVERSE] = "FastReverse", [KEY_FISHING_CHART] = "FishingChart", [KEY_FN_RIGHT_SHIFT] = "FnRightShift", [KEY_FRAMEBACK] = "FrameBack", [KEY_FRAMEFORWARD] = "FrameForward", [KEY_FULL_SCREEN] = "FullScreen", [KEY_GAMES] = "Games", [KEY_GRAPHICSEDITOR] = "GraphicsEditor", [KEY_HANGUP_PHONE] = "HangUpPhone", [KEY_IMAGES] = "Images", [KEY_KBD_LCD_MENU1] = "KbdLcdMenu1", [KEY_KBD_LCD_MENU2] = "KbdLcdMenu2", [KEY_KBD_LCD_MENU3] = "KbdLcdMenu3", [KEY_KBD_LCD_MENU4] = "KbdLcdMenu4", [KEY_KBD_LCD_MENU5] = "KbdLcdMenu5", [KEY_LEFT_DOWN] = "LeftDown", [KEY_LEFT_UP] = "LeftUp", [KEY_LIGHTS_TOGGLE] = "LightToggle", [KEY_MACRO_PRESET1] = "MacroPreset1", [KEY_MACRO_PRESET2] = "MacroPreset2", [KEY_MACRO_PRESET3] = "MacroPrest3", [KEY_MACRO_PRESET_CYCLE] = "MacroPresetCycle", [KEY_MACRO_RECORD_START] = "MacroRecordStart", [KEY_MACRO_RECORD_STOP] = "MacroRecordStop", [KEY_MARK_WAYPOINT] = "MarkWayPoint", [KEY_MEDIA_REPEAT] = "MediaRepeat", [KEY_MEDIA_TOP_MENU] = "MediaTopMenu", [KEY_MESSENGER] = "Messenger", [KEY_NAV_CHART] = "NavChar", [KEY_NAV_INFO] = "NavInfo", [KEY_NEWS] = "News", [KEY_NEXT_ELEMENT] = "NextElement", [KEY_NEXT_FAVORITE] = "NextFavorite", [KEY_NOTIFICATION_CENTER] = "NotificationCenter", [KEY_NUMERIC_0] = "Numeric0", [KEY_NUMERIC_1] = "Numeric1", [KEY_NUMERIC_11] = "Numceric11", [KEY_NUMERIC_12] = "Numeric12", [KEY_NUMERIC_2] = "Numeric2", [KEY_NUMERIC_3] = "Numeric3", [KEY_NUMERIC_4] = "Numeric4", [KEY_NUMERIC_5] = "Numeric5", [KEY_NUMERIC_6] = "Numeric6", [KEY_NUMERIC_7] = "Numeric7", [KEY_NUMERIC_8] = "Numeric8", [KEY_NUMERIC_9] = "Numeric9", [KEY_NUMERIC_A] = "NumericA", [KEY_NUMERIC_B] = "NumericB", [KEY_NUMERIC_C] = "NumericC", [KEY_NUMERIC_D] = "NumericD", [KEY_NUMERIC_POUND] = "NumericPound", [KEY_NUMERIC_STAR] = "NumericStar", [KEY_ONSCREEN_KEYBOARD] = "OnScreenKeyBoard", [KEY_PAUSE_RECORD] = "PauseRecord", [KEY_PICKUP_PHONE] = "PickUpPhone", [KEY_PRESENTATION] = "Presentation", [KEY_PREVIOUS_ELEMENT] = "PreviousElement", [KEY_PRIVACY_SCREEN_TOGGLE] = "PrivacyScreenToggle", [KEY_RADAR_OVERLAY] = "RadarOverLay", [KEY_RFKILL] = "RFKill", [KEY_RIGHT_DOWN] = "RightDown", [KEY_RIGHT_UP] = "RightUp", [KEY_ROOT_MENU] = "RootMenu", [KEY_ROTATE_LOCK_TOGGLE] = "RotateLockToggle", [KEY_SCALE] = "Scale", [KEY_SELECTIVE_SCREENSHOT] = "SelectiveScreenshot", [KEY_SIDEVU_SONAR] = "SideVUSonar", [KEY_SINGLE_RANGE_RADAR] = "SingleRangeRadar", [KEY_SLOWREVERSE] = "SlowReverse", [KEY_SOS] = "SOS", [KEY_SPREADSHEET] = "SpreadSheet", [KEY_STOP_RECORD] = "StopRecord", [KEY_TOUCHPAD_OFF] = "TouchPadOff", [KEY_TOUCHPAD_ON] = "TouchPadOn", [KEY_TOUCHPAD_TOGGLE] = "TouchPadToggle", [KEY_TRADITIONAL_SONAR] = "TraditionalSonar", [KEY_UNMUTE] = "Unmute", [KEY_UWB] = "UWB", [KEY_VIDEO_NEXT] = "VideoNext", [KEY_VIDEOPHONE] = "VideoPhone", [KEY_VIDEO_PREV] = "VideoPrev", [KEY_VOD] = "VOD", [KEY_VOICEMAIL] = "VoiceMail", [KEY_WLAN] = "WLAN", [KEY_WORDPROCESSOR] = "WordProcessor", [KEY_WPS_BUTTON] = "WPSButton", [KEY_WWAN] = "WWAN", [KEY_ZOOMIN] = "ZoomIn", [KEY_ZOOMOUT] = "ZoomOut", [KEY_ZOOMRESET] = "ZoomReset", }; static const char *relatives[REL_MAX + 1] = { [REL_X] = "X", [REL_Y] = "Y", [REL_Z] = "Z", [REL_RX] = "Rx", [REL_RY] = "Ry", [REL_RZ] = "Rz", [REL_HWHEEL] = "HWheel", [REL_DIAL] = "Dial", [REL_WHEEL] = "Wheel", [REL_MISC] = "Misc", [REL_WHEEL_HI_RES] = "WheelHiRes", [REL_HWHEEL_HI_RES] = "HWheelHiRes" }; static const char *absolutes[ABS_CNT] = { [ABS_X] = "X", [ABS_Y] = "Y", [ABS_Z] = "Z", [ABS_RX] = "Rx", [ABS_RY] = "Ry", [ABS_RZ] = "Rz", [ABS_THROTTLE] = "Throttle", [ABS_RUDDER] = "Rudder", [ABS_WHEEL] = "Wheel", [ABS_GAS] = "Gas", [ABS_BRAKE] = "Brake", [ABS_HAT0X] = "Hat0X", [ABS_HAT0Y] = "Hat0Y", [ABS_HAT1X] = "Hat1X", [ABS_HAT1Y] = "Hat1Y", [ABS_HAT2X] = "Hat2X", [ABS_HAT2Y] = "Hat2Y", [ABS_HAT3X] = "Hat3X", [ABS_HAT3Y] = "Hat 3Y", [ABS_PRESSURE] = "Pressure", [ABS_DISTANCE] = "Distance", [ABS_TILT_X] = "XTilt", [ABS_TILT_Y] = "YTilt", [ABS_TOOL_WIDTH] = "ToolWidth", [ABS_VOLUME] = "Volume", [ABS_PROFILE] = "Profile", [ABS_MISC] = "Misc", [ABS_MT_SLOT] = "MTSlot", [ABS_MT_TOUCH_MAJOR] = "MTMajor", [ABS_MT_TOUCH_MINOR] = "MTMinor", [ABS_MT_WIDTH_MAJOR] = "MTMajorW", [ABS_MT_WIDTH_MINOR] = "MTMinorW", [ABS_MT_ORIENTATION] = "MTOrientation", [ABS_MT_POSITION_X] = "MTPositionX", [ABS_MT_POSITION_Y] = "MTPositionY", [ABS_MT_TOOL_TYPE] = "MTToolType", [ABS_MT_BLOB_ID] = "MTBlobID", [ABS_MT_TRACKING_ID] = "MTTrackingID", [ABS_MT_PRESSURE] = "MTPressure", [ABS_MT_DISTANCE] = "MTDistance", [ABS_MT_TOOL_X] = "MTToolX", [ABS_MT_TOOL_Y] = "MTToolY", }; static const char *misc[MSC_MAX + 1] = { [MSC_SERIAL] = "Serial", [MSC_PULSELED] = "Pulseled", [MSC_GESTURE] = "Gesture", [MSC_RAW] = "RawData", [MSC_SCAN] = "Scan", [MSC_TIMESTAMP] = "TimeStamp", }; static const char *leds[LED_MAX + 1] = { [LED_NUML] = "NumLock", [LED_CAPSL] = "CapsLock", [LED_SCROLLL] = "ScrollLock", [LED_COMPOSE] = "Compose", [LED_KANA] = "Kana", [LED_SLEEP] = "Sleep", [LED_SUSPEND] = "Suspend", [LED_MUTE] = "Mute", [LED_MISC] = "Misc", [LED_MAIL] = "Mail", [LED_CHARGING] = "Charging", }; static const char *repeats[REP_MAX + 1] = { [REP_DELAY] = "Delay", [REP_PERIOD] = "Period" }; static const char *sounds[SND_MAX + 1] = { [SND_CLICK] = "Click", [SND_BELL] = "Bell", [SND_TONE] = "Tone" }; static const char *software[SW_CNT] = { [SW_LID] = "Lid", [SW_TABLET_MODE] = "TabletMode", [SW_HEADPHONE_INSERT] = "HeadPhoneInsert", [SW_RFKILL_ALL] = "RFKillAll", [SW_MICROPHONE_INSERT] = "MicrophoneInsert", [SW_DOCK] = "Dock", [SW_LINEOUT_INSERT] = "LineOutInsert", [SW_JACK_PHYSICAL_INSERT] = "JackPhysicalInsert", [SW_VIDEOOUT_INSERT] = "VideoOutInsert", [SW_CAMERA_LENS_COVER] = "CameraLensCover", [SW_KEYPAD_SLIDE] = "KeyPadSlide", [SW_FRONT_PROXIMITY] = "FrontProximity", [SW_ROTATE_LOCK] = "RotateLock", [SW_LINEIN_INSERT] = "LineInInsert", [SW_MUTE_DEVICE] = "MuteDevice", [SW_PEN_INSERTED] = "PenInserted", [SW_MACHINE_COVER] = "MachineCover", }; static const char *force[FF_CNT] = { [FF_RUMBLE] = "FF_RUMBLE", [FF_PERIODIC] = "FF_PERIODIC", [FF_CONSTANT] = "FF_CONSTANT", [FF_SPRING] = "FF_SPRING", [FF_FRICTION] = "FF_FRICTION", [FF_DAMPER] = "FF_DAMPER", [FF_INERTIA] = "FF_INERTIA", [FF_RAMP] = "FF_RAMP", [FF_SQUARE] = "FF_SQUARE", [FF_TRIANGLE] = "FF_TRIANGLE", [FF_SINE] = "FF_SINE", [FF_SAW_UP] = "FF_SAW_UP", [FF_SAW_DOWN] = "FF_SAW_DOWN", [FF_CUSTOM] = "FF_CUSTOM", [FF_GAIN] = "FF_GAIN", [FF_AUTOCENTER] = "FF_AUTOCENTER", [FF_MAX] = "FF_MAX", }; static const char *force_status[FF_STATUS_MAX + 1] = { [FF_STATUS_STOPPED] = "FF_STATUS_STOPPED", [FF_STATUS_PLAYING] = "FF_STATUS_PLAYING", }; static const char **names[EV_MAX + 1] = { [EV_SYN] = syncs, [EV_KEY] = keys, [EV_REL] = relatives, [EV_ABS] = absolutes, [EV_MSC] = misc, [EV_LED] = leds, [EV_SND] = sounds, [EV_REP] = repeats, [EV_SW] = software, [EV_FF] = force, [EV_FF_STATUS] = force_status, }; static void hid_resolv_event(__u8 type, __u16 code, struct seq_file *f) { if (events[type]) seq_printf(f, "%s.", events[type]); else seq_printf(f, "%02x.", type); if (names[type] && names[type][code]) seq_printf(f, "%s", names[type][code]); else seq_printf(f, "%04x", code); } static void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f) { int i, j, k; struct hid_report *report; struct hid_usage *usage; for (k = HID_INPUT_REPORT; k <= HID_OUTPUT_REPORT; k++) { list_for_each_entry(report, &hid->report_enum[k].report_list, list) { for (i = 0; i < report->maxfield; i++) { for ( j = 0; j < report->field[i]->maxusage; j++) { usage = report->field[i]->usage + j; hid_resolv_usage(usage->hid, f); seq_printf(f, " ---> "); hid_resolv_event(usage->type, usage->code, f); seq_printf(f, "\n"); } } } } } static int hid_debug_rdesc_show(struct seq_file *f, void *p) { struct hid_device *hdev = f->private; const __u8 *rdesc = hdev->rdesc; unsigned rsize = hdev->rsize; int i; if (!rdesc) { rdesc = hdev->dev_rdesc; rsize = hdev->dev_rsize; } /* dump HID report descriptor */ for (i = 0; i < rsize; i++) seq_printf(f, "%02x ", rdesc[i]); seq_printf(f, "\n\n"); /* dump parsed data and input mappings */ if (down_interruptible(&hdev->driver_input_lock)) return 0; hid_dump_device(hdev, f); seq_printf(f, "\n"); hid_dump_input_mapping(hdev, f); up(&hdev->driver_input_lock); return 0; } static int hid_debug_events_open(struct inode *inode, struct file *file) { int err = 0; struct hid_debug_list *list; unsigned long flags; if (!(list = kzalloc(sizeof(struct hid_debug_list), GFP_KERNEL))) { err = -ENOMEM; goto out; } err = kfifo_alloc(&list->hid_debug_fifo, HID_DEBUG_FIFOSIZE, GFP_KERNEL); if (err) { kfree(list); goto out; } list->hdev = (struct hid_device *) inode->i_private; kref_get(&list->hdev->ref); file->private_data = list; mutex_init(&list->read_mutex); spin_lock_irqsave(&list->hdev->debug_list_lock, flags); list_add_tail(&list->node, &list->hdev->debug_list); spin_unlock_irqrestore(&list->hdev->debug_list_lock, flags); out: return err; } static ssize_t hid_debug_events_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct hid_debug_list *list = file->private_data; int ret = 0, copied; DECLARE_WAITQUEUE(wait, current); mutex_lock(&list->read_mutex); if (kfifo_is_empty(&list->hid_debug_fifo)) { add_wait_queue(&list->hdev->debug_wait, &wait); set_current_state(TASK_INTERRUPTIBLE); while (kfifo_is_empty(&list->hid_debug_fifo)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* if list->hdev is NULL we cannot remove_wait_queue(). * if list->hdev->debug is 0 then hid_debug_unregister() * was already called and list->hdev is being destroyed. * if we add remove_wait_queue() here we can hit a race. */ if (!list->hdev || !list->hdev->debug) { ret = -EIO; __set_current_state(TASK_RUNNING); goto out; } if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } /* allow O_NONBLOCK from other threads */ mutex_unlock(&list->read_mutex); schedule(); mutex_lock(&list->read_mutex); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); remove_wait_queue(&list->hdev->debug_wait, &wait); if (ret) goto out; } /* pass the fifo content to userspace, locking is not needed with only * one concurrent reader and one concurrent writer */ ret = kfifo_to_user(&list->hid_debug_fifo, buffer, count, &copied); if (ret) goto out; ret = copied; out: mutex_unlock(&list->read_mutex); return ret; } static __poll_t hid_debug_events_poll(struct file *file, poll_table *wait) { struct hid_debug_list *list = file->private_data; poll_wait(file, &list->hdev->debug_wait, wait); if (!kfifo_is_empty(&list->hid_debug_fifo)) return EPOLLIN | EPOLLRDNORM; if (!list->hdev->debug) return EPOLLERR | EPOLLHUP; return 0; } static int hid_debug_events_release(struct inode *inode, struct file *file) { struct hid_debug_list *list = file->private_data; unsigned long flags; spin_lock_irqsave(&list->hdev->debug_list_lock, flags); list_del(&list->node); spin_unlock_irqrestore(&list->hdev->debug_list_lock, flags); kfifo_free(&list->hid_debug_fifo); kref_put(&list->hdev->ref, hiddev_free); kfree(list); return 0; } DEFINE_SHOW_ATTRIBUTE(hid_debug_rdesc); static const struct file_operations hid_debug_events_fops = { .owner = THIS_MODULE, .open = hid_debug_events_open, .read = hid_debug_events_read, .poll = hid_debug_events_poll, .release = hid_debug_events_release, .llseek = noop_llseek, }; void hid_debug_register(struct hid_device *hdev, const char *name) { hdev->debug_dir = debugfs_create_dir(name, hid_debug_root); hdev->debug_rdesc = debugfs_create_file("rdesc", 0400, hdev->debug_dir, hdev, &hid_debug_rdesc_fops); hdev->debug_events = debugfs_create_file("events", 0400, hdev->debug_dir, hdev, &hid_debug_events_fops); hdev->debug = 1; } void hid_debug_unregister(struct hid_device *hdev) { hdev->debug = 0; wake_up_interruptible(&hdev->debug_wait); debugfs_remove(hdev->debug_rdesc); debugfs_remove(hdev->debug_events); debugfs_remove(hdev->debug_dir); } void hid_debug_init(void) { hid_debug_root = debugfs_create_dir("hid", NULL); } void hid_debug_exit(void) { debugfs_remove_recursive(hid_debug_root); } |
| 1 1 1 4 5 1 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> */ #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> static unsigned int netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; struct nf_nat_range2 newrange; struct nf_conn *ct; enum ip_conntrack_info ctinfo; union nf_inet_addr new_addr, netmask; unsigned int i; ct = nf_ct_get(skb, &ctinfo); for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++) netmask.ip6[i] = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); if (xt_hooknum(par) == NF_INET_PRE_ROUTING || xt_hooknum(par) == NF_INET_LOCAL_OUT) new_addr.in6 = ipv6_hdr(skb)->daddr; else new_addr.in6 = ipv6_hdr(skb)->saddr; for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) { new_addr.ip6[i] &= ~netmask.ip6[i]; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask.ip6[i]; } newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr = new_addr; newrange.max_addr = new_addr; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); } static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_range2 *range = par->targinfo; if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; return nf_ct_netns_get(par->net, par->family); } static void netmap_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static unsigned int netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 newrange; WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && xt_hooknum(par) != NF_INET_POST_ROUTING && xt_hooknum(par) != NF_INET_LOCAL_OUT && xt_hooknum(par) != NF_INET_LOCAL_IN); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); if (xt_hooknum(par) == NF_INET_PRE_ROUTING || xt_hooknum(par) == NF_INET_LOCAL_OUT) new_ip = ip_hdr(skb)->daddr & ~netmask; else new_ip = ip_hdr(skb)->saddr & ~netmask; new_ip |= mr->range[0].min_ip & netmask; memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = new_ip; newrange.max_addr.ip = new_ip; newrange.min_proto = mr->range[0].min; newrange.max_proto = mr->range[0].max; /* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); } static int netmap_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) { pr_debug("bad MAP_IPS.\n"); return -EINVAL; } if (mr->rangesize != 1) { pr_debug("bad rangesize %u.\n", mr->rangesize); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static struct xt_target netmap_tg_reg[] __read_mostly = { { .name = "NETMAP", .family = NFPROTO_IPV6, .revision = 0, .target = netmap_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg6_checkentry, .destroy = netmap_tg_destroy, .me = THIS_MODULE, }, { .name = "NETMAP", .family = NFPROTO_IPV4, .revision = 0, .target = netmap_tg4, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg4_check, .destroy = netmap_tg_destroy, .me = THIS_MODULE, }, }; static int __init netmap_tg_init(void) { return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); } static void netmap_tg_exit(void) { xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); } module_init(netmap_tg_init); module_exit(netmap_tg_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ip6t_NETMAP"); MODULE_ALIAS("ipt_NETMAP"); |
| 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 | // SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <net/checksum.h> #include <net/dst_cache.h> #include <net/ip.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/lwtunnel.h> #include <net/protocol.h> #include <uapi/linux/ila.h> #include "ila.h" struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( struct lwtunnel_state *lwt) { return (struct ila_lwt *)lwt->data; } static inline struct ila_params *ila_params_lwtunnel( struct lwtunnel_state *lwt) { return &ila_lwt_lwtunnel(lwt)->p; } static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct rt6_info *rt = dst_rt6_info(orig_dst); struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; if (ilwt->lwt_output) ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for * dest cache route. */ return orig_dst->lwtstate->orig_output(net, sk, skb); } local_bh_disable(); dst = dst_cache_get(&ilwt->dst_cache); local_bh_enable(); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; /* Lookup a route for the new destination. Take into * account that the base route may already have a gateway. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = dst_dev(orig_dst)->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = -EHOSTUNREACH; dst_release(dst); goto drop; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto drop; } /* cache only if we don't create a dst reference loop */ if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); local_bh_enable(); } } skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; if (!ilwt->lwt_output) ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); return dst->lwtstate->orig_input(skb); drop: kfree_skb(skb); return -EINVAL; } static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct ila_lwt *ilwt; struct ila_params *p; struct nlattr *tb[ILA_ATTR_MAX + 1]; struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; u8 ident_type = ILA_ATYPE_USE_FORMAT; u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; u8 csum_mode = ILA_CSUM_NO_ACTION; bool lwt_output = true; u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; ret = nla_parse_nested_deprecated(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; iaddr = (struct ila_addr *)&cfg6->fc_dst; if (tb[ILA_ATTR_IDENT_TYPE]) ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); if (ident_type == ILA_ATYPE_USE_FORMAT) { /* Infer identifier type from type field in formatted * identifier. */ if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { /* Need to have full locator and at least type field * included in destination */ return -EINVAL; } eff_ident_type = iaddr->ident.type; } else { eff_ident_type = ident_type; } switch (eff_ident_type) { case ILA_ATYPE_IID: /* Don't allow ILA for IID type */ return -EINVAL; case ILA_ATYPE_LUID: break; case ILA_ATYPE_VIRT_V4: case ILA_ATYPE_VIRT_UNI_V6: case ILA_ATYPE_VIRT_MULTI_V6: case ILA_ATYPE_NONLOCAL_ADDR: /* These ILA formats are not supported yet. */ default: return -EINVAL; } if (tb[ILA_ATTR_HOOK_TYPE]) hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); switch (hook_type) { case ILA_HOOK_ROUTE_OUTPUT: lwt_output = true; break; case ILA_HOOK_ROUTE_INPUT: lwt_output = false; break; default: return -EINVAL; } if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); if (csum_mode == ILA_CSUM_NEUTRAL_MAP && ila_csum_neutral_set(iaddr->ident)) { /* Don't allow translation if checksum neutral bit is * configured and it's set in the SIR address. */ return -EINVAL; } newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) return -ENOMEM; ilwt = ila_lwt_lwtunnel(newts); ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC); if (ret) { kfree(newts); return ret; } ilwt->lwt_output = !!lwt_output; p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. */ p->locator_match = iaddr->loc; ila_init_saved_csum(p); newts->type = LWTUNNEL_ENCAP_ILA; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | LWTUNNEL_STATE_INPUT_REDIRECT; if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr)) ilwt->connected = 1; *ts = newts; return 0; } static void ila_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache); } static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : ILA_HOOK_ROUTE_INPUT)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct ila_params *a_p = ila_params_lwtunnel(a); struct ila_params *b_p = ila_params_lwtunnel(b); return (a_p->locator.v64 != b_p->locator.v64); } static const struct lwtunnel_encap_ops ila_encap_ops = { .build_state = ila_build_state, .destroy_state = ila_destroy_state, .output = ila_output, .input = ila_input, .fill_encap = ila_fill_encap_info, .get_encap_size = ila_encap_nlsize, .cmp_encap = ila_encap_cmp, .owner = THIS_MODULE, }; int ila_lwt_init(void) { return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); } void ila_lwt_fini(void) { lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); } |
| 135 135 135 135 135 135 135 135 135 135 135 135 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | /* mpiutil.ac - Utility functions for MPI * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include <linux/export.h> #include "mpi-internal.h" /**************** * Note: It was a bad idea to use the number of limbs to allocate * because on a alpha the limbs are large but we normally need * integers of n bits - So we should change this to bits (or bytes). * * But mpi_alloc is used in a lot of places :-) */ MPI mpi_alloc(unsigned nlimbs) { MPI a; a = kmalloc(sizeof *a, GFP_KERNEL); if (!a) return a; if (nlimbs) { a->d = mpi_alloc_limb_space(nlimbs); if (!a->d) { kfree(a); return NULL; } } else { a->d = NULL; } a->alloced = nlimbs; a->nlimbs = 0; a->sign = 0; a->flags = 0; a->nbits = 0; return a; } EXPORT_SYMBOL_GPL(mpi_alloc); mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs) { size_t len = nlimbs * sizeof(mpi_limb_t); if (!len) return NULL; return kmalloc(len, GFP_KERNEL); } void mpi_free_limb_space(mpi_ptr_t a) { if (!a) return; kfree_sensitive(a); } void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs) { mpi_free_limb_space(a->d); a->d = ap; a->alloced = nlimbs; } /**************** * Resize the array of A to NLIMBS. the additional space is cleared * (set to 0) [done by m_realloc()] */ int mpi_resize(MPI a, unsigned nlimbs) { void *p; if (nlimbs <= a->alloced) return 0; /* no need to do it */ if (a->d) { p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!p) return -ENOMEM; memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); kfree_sensitive(a->d); a->d = p; } else { a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL); if (!a->d) return -ENOMEM; } a->alloced = nlimbs; return 0; } void mpi_free(MPI a) { if (!a) return; if (a->flags & 4) kfree_sensitive(a->d); else mpi_free_limb_space(a->d); if (a->flags & ~7) pr_info("invalid flag value in mpi\n"); kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); /**************** * Note: This copy function should not interpret the MPI * but copy it transparently. */ MPI mpi_copy(MPI a) { int i; MPI b; if (a) { b = mpi_alloc(a->nlimbs); if (!b) return NULL; b->nlimbs = a->nlimbs; b->sign = a->sign; b->flags = a->flags; b->flags &= ~(16|32); /* Reset the immutable and constant flags. */ for (i = 0; i < b->nlimbs; i++) b->d[i] = a->d[i]; } else b = NULL; return b; } MODULE_DESCRIPTION("Multiprecision maths library"); MODULE_LICENSE("GPL"); |
| 2073 2072 1549 2072 2072 2070 29035 29040 24165 14450 14452 26399 26406 23579 7799 3238 5885 3036 3037 3 3038 573 44 573 573 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 | // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; #ifdef CONFIG_DEBUG_FS #include <linux/xarray.h> /* * ref_tracker_dir_init() is usually called in allocation-safe contexts, but * the same is not true of ref_tracker_dir_exit() which can be called from * anywhere an object is freed. Removing debugfs dentries is a blocking * operation, so we defer that work to the debugfs_reap_worker. * * Each dentry is tracked in the appropriate xarray. When * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and * the workqueue job is scheduled. The worker then runs and deletes any marked * dentries asynchronously. */ static struct xarray debugfs_dentries; static struct xarray debugfs_symlinks; static struct work_struct debugfs_reap_worker; #define REF_TRACKER_DIR_DEAD XA_MARK_0 static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { unsigned long flags; xa_lock_irqsave(&debugfs_dentries, flags); __xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_dentries, flags); xa_lock_irqsave(&debugfs_symlinks, flags); __xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD); xa_unlock_irqrestore(&debugfs_symlinks, flags); schedule_work(&debugfs_reap_worker); } #else static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir) { } #endif static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...); char *prefix; char *buf; struct seq_file *seq; int size, used; }; static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); vprintk(fmt, args); va_end(args); } static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...) { int ret, len = stream->size - stream->used; va_list args; va_start(args, fmt); ret = vsnprintf(stream->buf + stream->used, len, fmt, args); va_end(args); if (ret > 0) stream->used += min(ret, len); } #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ _s->func(_s, fmt, ##args); \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n", s->prefix, dir->class, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix, dir->class, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n", s->prefix, dir->class, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = { .func = pr_ostream_log, .prefix = "ref_tracker: " }; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .func = pr_ostream_buf, .prefix = "ref_tracker: ", .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; /* * The xarray entries must be marked before the dir->lock is taken to * protect simultaneous debugfs readers. */ ref_tracker_debugfs_mark(dir); spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> static struct dentry *ref_tracker_debug_dir = (struct dentry *)-ENOENT; static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, ...) { va_list args; va_start(args, fmt); seq_vprintf(stream->seq, fmt, args); va_end(args); } static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq) { struct ostream os = { .func = pr_ostream_seq, .prefix = "", .seq = seq }; __ref_tracker_dir_pr_ostream(dir, 16, &os); return os.used; } static int ref_tracker_debugfs_show(struct seq_file *f, void *v) { struct ref_tracker_dir *dir = f->private; unsigned long index = (unsigned long)dir; unsigned long flags; int ret; /* * "dir" may not exist at this point if ref_tracker_dir_exit() has * already been called. Take care not to dereference it until its * legitimacy is established. * * The xa_lock is necessary to ensure that "dir" doesn't disappear * before its lock can be taken. If it's in the hash and not marked * dead, then it's safe to take dir->lock which prevents * ref_tracker_dir_exit() from completing. Once the dir->lock is * acquired, the xa_lock can be released. All of this must be IRQ-safe. */ xa_lock_irqsave(&debugfs_dentries, flags); if (!xa_load(&debugfs_dentries, index) || xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) { xa_unlock_irqrestore(&debugfs_dentries, flags); return -ENODATA; } spin_lock(&dir->lock); xa_unlock(&debugfs_dentries); ret = ref_tracker_dir_seq_print(dir, f); spin_unlock_irqrestore(&dir->lock, flags); return ret; } static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp) { struct ref_tracker_dir *dir = inode->i_private; return single_open(filp, ref_tracker_debugfs_show, dir); } static const struct file_operations ref_tracker_debugfs_fops = { .owner = THIS_MODULE, .open = ref_tracker_debugfs_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /** * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir * @dir: ref_tracker_dir to be associated with debugfs file * * In most cases, a debugfs file will be created automatically for every * ref_tracker_dir. If the object was created before debugfs is brought up * then that may fail. In those cases, it is safe to call this at a later * time to create the file. */ void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir) { char name[NAME_MAX + 1]; struct dentry *dentry; int ret; /* No-op if already created */ dentry = xa_load(&debugfs_dentries, (unsigned long)dir); if (dentry && !xa_is_err(dentry)) return; ret = snprintf(name, sizeof(name), "%s@%p", dir->class, dir); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { dentry = debugfs_create_file(name, S_IFREG | 0400, ref_tracker_debug_dir, dir, &ref_tracker_debugfs_fops); if (!IS_ERR(dentry)) { void *old; old = xa_store_irq(&debugfs_dentries, (unsigned long)dir, dentry, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(dentry); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_debugfs); void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...) { char name[NAME_MAX + 1]; struct dentry *symlink, *dentry; va_list args; int ret; symlink = xa_load(&debugfs_symlinks, (unsigned long)dir); dentry = xa_load(&debugfs_dentries, (unsigned long)dir); /* Already created?*/ if (symlink && !xa_is_err(symlink)) return; if (!dentry || xa_is_err(dentry)) return; va_start(args, fmt); ret = vsnprintf(name, sizeof(name), fmt, args); va_end(args); name[sizeof(name) - 1] = '\0'; if (ret < sizeof(name)) { symlink = debugfs_create_symlink(name, ref_tracker_debug_dir, dentry->d_name.name); if (!IS_ERR(symlink)) { void *old; old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir, symlink, GFP_KERNEL); if (xa_is_err(old)) debugfs_remove(symlink); else WARN_ON_ONCE(old); } } } EXPORT_SYMBOL(ref_tracker_dir_symlink); static void debugfs_reap_work(struct work_struct *work) { struct dentry *dentry; unsigned long index; bool reaped; do { reaped = false; xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_symlinks, index); debugfs_remove(dentry); reaped = true; } xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) { xa_erase_irq(&debugfs_dentries, index); debugfs_remove(dentry); reaped = true; } } while (reaped); } static int __init ref_tracker_debugfs_postcore_init(void) { INIT_WORK(&debugfs_reap_worker, debugfs_reap_work); xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ); xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ); return 0; } postcore_initcall(ref_tracker_debugfs_postcore_init); static int __init ref_tracker_debugfs_late_init(void) { ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL); return 0; } late_initcall(ref_tracker_debugfs_late_init); #endif /* CONFIG_DEBUG_FS */ |
| 3 2 3 2 3 3 3 2 2 2 1 3 1 1 2 2 1 4 3 3 3 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | // SPDX-License-Identifier: GPL-2.0-only /* * TCP Westwood+: end-to-end bandwidth estimation for TCP * * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 * * Support at http://c3lab.poliba.it/index.php/Westwood * Main references in literature: * * - Mascolo S, Casetti, M. Gerla et al. * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 * * - A. Grieco, s. Mascolo * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer * Comm. Review, 2004 * * - A. Dell'Aera, L. Grieco, S. Mascolo. * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 * * Westwood+ employs end-to-end bandwidth measurement to set cwnd and * ssthresh after packet loss. The probing phase is as the original Reno. */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> /* TCP Westwood structure */ struct westwood { u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ u32 bw_est; /* bandwidth estimate */ u32 rtt_win_sx; /* here starts a new evaluation... */ u32 bk; u32 snd_una; /* used for evaluating the number of acked bytes */ u32 cumul_ack; u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ u8 first_ack; /* flag which infers that this is the first ack */ u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; /* TCP Westwood functions and constants */ #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ /* * @tcp_westwood_create * This function initializes fields used in TCP Westwood+, * it is called after the initial SYN, so the sequence numbers * are correct but new passive connections we have no * information about RTTmin at this time so we simply set it to * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative * since in this way we're sure it will be updated in a consistent * way as soon as possible. It will reasonably happen within the first * RTT period of the connection lifetime. */ static void tcp_westwood_init(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); w->bk = 0; w->bw_ns_est = 0; w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_jiffies32; w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 1; } /* * @westwood_do_filter * Low-pass filter. Implemented using constant coefficients. */ static inline u32 westwood_do_filter(u32 a, u32 b) { return ((7 * a) + b) >> 3; } static void westwood_filter(struct westwood *w, u32 delta) { /* If the filter is empty fill it with the first sample of bandwidth */ if (w->bw_ns_est == 0 && w->bw_est == 0) { w->bw_ns_est = w->bk / delta; w->bw_est = w->bw_ns_est; } else { w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); } } /* * @westwood_pkts_acked * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ static void tcp_westwood_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct westwood *w = inet_csk_ca(sk); if (sample->rtt_us > 0) w->rtt = usecs_to_jiffies(sample->rtt_us); } /* * @westwood_update_window * It updates RTT evaluation window if it is the right moment to do * it. If so it calls filter for evaluating bandwidth. */ static void westwood_update_window(struct sock *sk) { struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_jiffies32 - w->rtt_win_sx; /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first * bandwidth sample */ if (w->first_ack) { w->snd_una = tcp_sk(sk)->snd_una; w->first_ack = 0; } /* * See if a RTT-window has passed. * Be careful since if RTT is less than * 50ms we don't filter but we continue 'building the sample'. * This minimum limit was chosen since an estimation on small * time intervals is better to avoid... * Obviously on a LAN we reasonably will always have * right_bound = left_bound + WESTWOOD_RTT_MIN */ if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { westwood_filter(w, delta); w->bk = 0; w->rtt_win_sx = tcp_jiffies32; } } static inline void update_rtt_min(struct westwood *w) { if (w->reset_rtt_min) { w->rtt_min = w->rtt; w->reset_rtt_min = 0; } else w->rtt_min = min(w->rtt, w->rtt_min); } /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when * header prediction is successful. In such case in fact update is * straight forward and doesn't need any particular care. */ static inline void westwood_fast_bw(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); westwood_update_window(sk); w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; update_rtt_min(w); } /* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. */ static inline u32 westwood_acked_count(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); w->cumul_ack = tp->snd_una - w->snd_una; /* If cumul_ack is 0 this is a dupack since it's not moving * tp->snd_una. */ if (!w->cumul_ack) { w->accounted += tp->mss_cache; w->cumul_ack = tp->mss_cache; } if (w->cumul_ack > tp->mss_cache) { /* Partial or delayed ack */ if (w->accounted >= w->cumul_ack) { w->accounted -= w->cumul_ack; w->cumul_ack = tp->mss_cache; } else { w->cumul_ack -= w->accounted; w->accounted = 0; } } w->snd_una = tp->snd_una; return w->cumul_ack; } /* * TCP Westwood * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct westwood *w = inet_csk_ca(sk); return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { if (ack_flags & CA_ACK_SLOWPATH) { struct westwood *w = inet_csk_ca(sk); westwood_update_window(sk); w->bk += westwood_acked_count(sk); update_rtt_min(w); return; } westwood_fast_bw(sk); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); switch (event) { case CA_EVENT_COMPLETE_CWR: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); tcp_snd_cwnd_set(tp, tp->snd_ssthresh); break; case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; break; default: /* don't care */ break; } } /* Extract info for Tcp socket info provided via netlink. */ static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct westwood *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = 1; info->vegas.tcpv_rttcnt = 0; info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt); info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); } return 0; } static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, .cwnd_event = tcp_westwood_event, .in_ack_event = tcp_westwood_ack, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, .owner = THIS_MODULE, .name = "westwood" }; static int __init tcp_westwood_register(void) { BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_westwood); } static void __exit tcp_westwood_unregister(void) { tcp_unregister_congestion_control(&tcp_westwood); } module_init(tcp_westwood_register); module_exit(tcp_westwood_unregister); MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Westwood+"); |
| 17 5 5 5 61 2 16 8 15 15 64 6 106 57 47 11 11 52 55 4 4 8 8 8 8 3 4 5 2 119 7 6 19 5 5 5 5 3 3 2 1 2 8 2 1 11 11 8 2 7 7 7 61 7 11 45 43 38 43 4 60 104 104 6 10 68 24 74 17 104 21 16 11 5 6 11 15 7 19 2 5 12 104 94 30 20 31 17 130 109 96 96 4 1 124 125 124 3 2 2 16 4 12 1 1 1 73 73 73 93 93 38 91 403 403 29 28 1 1 2 2 1 29 37 109 400 91 91 39 60 3 11 75 100 100 98 5 141 133 1 8 9 130 66 62 81 44 101 20 92 21 2 4 25 3 2 2 1 1 16 8 8 2 14 10 97 97 3 17 82 8 86 4 3 85 24 76 3 1 41 38 70 9 62 17 56 2 21 5 72 3 72 3 7 73 73 3 9 59 93 89 7 93 93 17 7 1 10 2 14 3 14 3 7 1 17 8 6 4 329 40 19 39 394 2 1 2 377 40 40 40 3 1 394 288 135 4688 4669 330 185 31 15 9 7 21 11 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> * * The code this is based on carried the following copyright notice: * --- * (C) Copyright 2001-2006 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com * Re-worked by Ben Greear <greearb@candelatech.com> * --- */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/net_tstamp.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/if_link.h> #include <linux/if_macvlan.h> #include <linux/hash.h> #include <linux/workqueue.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <linux/netpoll.h> #include <linux/phy.h> #define MACVLAN_HASH_BITS 8 #define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS) #define MACVLAN_DEFAULT_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 #define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; struct list_head vlans; struct sk_buff_head bc_queue; struct work_struct bc_work; u32 bc_queue_len_used; int bc_cutoff; u32 flags; int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(bc_filter, MACVLAN_MC_FILTER_SZ); DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { struct hlist_node hlist; struct macvlan_dev *vlan; unsigned char addr[6+2] __aligned(sizeof(u16)); struct rcu_head rcu; }; struct macvlan_skb_cb { const struct macvlan_dev *src; }; #define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0])) static void macvlan_port_destroy(struct net_device *dev); static void update_port_bc_queue_len(struct macvlan_port *port); static inline bool macvlan_passthru(const struct macvlan_port *port) { return port->flags & MACVLAN_F_PASSTHRU; } static inline void macvlan_set_passthru(struct macvlan_port *port) { port->flags |= MACVLAN_F_PASSTHRU; } static inline bool macvlan_addr_change(const struct macvlan_port *port) { return port->flags & MACVLAN_F_ADDRCHANGE; } static inline void macvlan_set_addr_change(struct macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(entry->addr, addr) && entry->vlan == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); entry->vlan = vlan; h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ eth_hw_addr_set(vlan->dev, addr); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. */ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct sk_buff *skb, const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { struct net_device *dev = vlan->dev; if (local) return __dev_forward_skb(dev, skb); skb->dev = dev; if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; return 0; } static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) { return (u32)(((unsigned long)vlan) >> L1_CACHE_SHIFT); } static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { u32 val = get_unaligned((u32 *)(addr + 2)); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); } static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port, struct net_device *src, enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct sk_buff *nskb; unsigned int i; int err; unsigned int hash; if (skb->protocol == htons(ETH_P_PAUSE)) return; hash_for_each_rcu(port->vlan_hash, i, vlan, hlist) { if (vlan->dev == src || !(vlan->mode & mode)) continue; hash = mc_hash(vlan, eth->h_dest); if (!test_bit(hash, vlan->mc_filter)) continue; err = NET_RX_DROP; nskb = skb_clone(skb, GFP_ATOMIC); if (likely(nskb)) err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx(nskb); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, true); } } static void macvlan_multicast_rx(const struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { if (!src) /* frame comes from an external address */ macvlan_broadcast(skb, port, NULL, MACVLAN_MODE_PRIVATE | MACVLAN_MODE_VEPA | MACVLAN_MODE_PASSTHRU| MACVLAN_MODE_BRIDGE); else if (src->mode == MACVLAN_MODE_VEPA) /* flood to everyone except source */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE); else /* * flood only to VEPA ports, bridge ports * already saw the frame on the way out. */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); } static void macvlan_process_broadcast(struct work_struct *w) { struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work); struct sk_buff *skb; struct sk_buff_head list; __skb_queue_head_init(&list); spin_lock_bh(&port->bc_queue.lock); skb_queue_splice_tail_init(&port->bc_queue, &list); spin_unlock_bh(&port->bc_queue.lock); while ((skb = __skb_dequeue(&list))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; rcu_read_lock(); macvlan_multicast_rx(port, src, skb); rcu_read_unlock(); if (src) dev_put(src->dev); consume_skb(skb); cond_resched(); } } static void macvlan_broadcast_enqueue(struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { struct sk_buff *nskb; int err = -ENOMEM; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) goto err; MACVLAN_SKB_CB(nskb)->src = src; spin_lock(&port->bc_queue.lock); if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) { if (src) dev_hold(src->dev); __skb_queue_tail(&port->bc_queue, nskb); err = 0; } spin_unlock(&port->bc_queue.lock); queue_work(system_dfl_wq, &port->bc_work); if (err) goto free_nskb; return; free_nskb: kfree_skb(nskb); err: dev_core_stats_rx_dropped_inc(skb->dev); } static void macvlan_flush_sources(struct macvlan_port *port, struct macvlan_dev *vlan) { struct macvlan_source_entry *entry; struct hlist_node *next; int i; hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist) if (entry->vlan == vlan) macvlan_hash_del_source(entry); vlan->macaddr_count = 0; } static void macvlan_forward_source_one(struct sk_buff *skb, struct macvlan_dev *vlan) { struct sk_buff *nskb; struct net_device *dev; int len; int ret; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; len = nskb->len + ETH_HLEN; nskb->dev = dev; if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; ret = __netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr)) { if (entry->vlan->flags & MACVLAN_FLAG_NODST) consume = true; macvlan_forward_source_one(skb, entry->vlan); } } return consume; } /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len = 0; int ret; rx_handler_result_t handle_res; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { /* forward to original port. */ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: __netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } hash = mc_hash(NULL, eth->h_dest); if (test_bit(hash, port->bc_filter)) macvlan_broadcast_enqueue(port, src, skb); else if (test_bit(hash, port->mc_filter)) macvlan_multicast_rx(port, src, skb); return RX_HANDLER_PASS; } if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else vlan = macvlan_hash_lookup(port, eth->h_dest); if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE) return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { ret = NET_RX_DROP; handle_res = RX_HANDLER_CONSUMED; goto out; } *pskb = skb; skb->dev = dev; skb->pkt_type = PACKET_HOST; ret = NET_RX_SUCCESS; handle_res = RX_HANDLER_ANOTHER; out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); return handle_res; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } } xmit_world: skb->dev = vlan->lowerdev; return dev_queue_xmit_accel(skb, netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; if (unlikely(netpoll_tx_running(dev))) return macvlan_netpoll_send_skb(vlan, skb); ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(vlan->pcpu_stats->tx_dropped); } return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return dev_hard_header(skb, lowerdev, type, daddr, saddr ? : dev->dev_addr, len); } static const struct header_ops macvlan_hard_header_ops = { .create = macvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static int macvlan_open(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; int err; if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto out; } goto hash_add; } err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; /* Attempt to populate accel_priv which is used to offload the L2 * forwarding requests for unicast packets. */ if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) vlan->accel_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); /* If earlier attempt to offload failed, or accel_priv is not * populated we must add the unicast address to the lower device. */ if (IS_ERR_OR_NULL(vlan->accel_priv)) { vlan->accel_priv = NULL; err = dev_uc_add(lowerdev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(lowerdev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto clear_multi; } hash_add: macvlan_hash_add(vlan); return 0; clear_multi: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); del_unicast: if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } else { dev_uc_del(lowerdev, dev->dev_addr); } out: return err; } static int macvlan_stop(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; } if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(lowerdev, -1); dev_uc_del(lowerdev, dev->dev_addr); hash_del: macvlan_hash_del(vlan, !dev->dismantle); return 0; } static int macvlan_sync_address(struct net_device *dev, const unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { /* Just copy in the new address */ eth_hw_addr_set(dev, addr); } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; dev_uc_del(lowerdev, dev->dev_addr); } macvlan_hash_change_addr(vlan, addr); } if (macvlan_passthru(port) && !macvlan_addr_change(port)) { /* Since addr_change isn't set, we are here due to lower * device change. Save the lower-dev address so we can * restore it later. */ ether_addr_copy(vlan->port->perm_addr, lowerdev->dev_addr); } macvlan_clear_addr_change(port); return 0; } static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); struct sockaddr_storage *addr = p; if (!is_valid_ether_addr(addr->__data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ if (ether_addr_equal(dev->dev_addr, addr->__data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->__data)) return -EADDRINUSE; return macvlan_sync_address(dev, addr->__data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void macvlan_compute_filter(unsigned long *mc_filter, struct net_device *dev, struct macvlan_dev *vlan, int cutoff) { if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ); } else { DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ); struct netdev_hw_addr *ha; bitmap_zero(filter, MACVLAN_MC_FILTER_SZ); netdev_for_each_mc_addr(ha, dev) { if (!vlan && ha->synced <= cutoff) continue; __set_bit(mc_hash(vlan, ha->addr), filter); } __set_bit(mc_hash(vlan, dev->broadcast), filter); bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ); } } static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan) { if (vlan->port->bc_cutoff < 0) { bitmap_zero(vlan->port->bc_filter, MACVLAN_MC_FILTER_SZ); return; } macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL, vlan->port->bc_cutoff); } static void macvlan_set_mac_lists(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvlan_compute_filter(vlan->mc_filter, dev, vlan, 0); dev_uc_sync(vlan->lowerdev, dev); dev_mc_sync(vlan->lowerdev, dev); /* This is slightly inaccurate as we're including the subscription * list of vlan->lowerdev too. * * Bug alert: This only works if everyone has the same broadcast * address as lowerdev. As soon as someone changes theirs this * will break. * * However, this is already broken as when you change your broadcast * address we don't get called. * * The solution is to maintain a list of broadcast addresses like * we do for uc/mc, if you care. */ macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL, 0); macvlan_recompute_bc_filter(vlan); } static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff) { if (vlan->port->bc_cutoff == cutoff) return; vlan->port->bc_cutoff = cutoff; macvlan_recompute_bc_filter(vlan); } static int macvlan_change_mtu(struct net_device *dev, int new_mtu) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->lowerdev->mtu < new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int macvlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return generic_hwtstamp_get_lower(real_dev, cfg); } static int macvlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = macvlan_dev_real_dev(dev); if (!net_eq(dev_net(dev), &init_net)) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } /* * macvlan network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) #define ALWAYS_ON_FEATURES ALWAYS_ON_OFFLOADS #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; dev->features |= ALWAYS_ON_FEATURES; dev->hw_features |= NETIF_F_LRO; dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; dev->lltx = true; netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; port->count += 1; /* Get macvlan's reference to lowerdev */ netdev_hold(lowerdev, &vlan->dev_tracker, GFP_KERNEL); return 0; } static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); } static void macvlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->pcpu_stats) { struct vlan_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_errors = 0, tx_dropped = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* rx_errors & tx_dropped are u32, updated * without syncp protection. */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->rx_dropped = rx_errors; stats->tx_dropped = tx_dropped; } } static int macvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return vlan_vid_add(lowerdev, proto, vid); } static int macvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; vlan_vid_del(lowerdev, proto, vid); return 0; } static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); return err; } static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver)); strscpy(drvinfo->version, "0.1", sizeof(drvinfo->version)); } static int macvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); return __ethtool_get_link_ksettings(vlan->lowerdev, cmd); } static int macvlan_ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return ethtool_get_ts_info_by_layer(real_dev, info); } static netdev_features_t macvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct macvlan_dev *vlan = netdev_priv(dev); netdev_features_t lowerdev_features = vlan->lowerdev->features; netdev_features_t mask; features |= NETIF_F_ALL_FOR_ALL; features &= (vlan->set_features | ~MACVLAN_FEATURES); mask = features; lowerdev_features &= (features | ~NETIF_F_LRO); features = netdev_increment_features(lowerdev_features, features, mask); features |= ALWAYS_ON_FEATURES; features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES); return features; } #ifdef CONFIG_NET_POLL_CONTROLLER static void macvlan_dev_poll_controller(struct net_device *dev) { return; } static int macvlan_dev_netpoll_setup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; struct netpoll *netpoll; int err; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void macvlan_dev_netpoll_cleanup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return READ_ONCE(vlan->lowerdev->ifindex); } static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = macvlan_ethtool_get_link_ksettings, .get_drvinfo = macvlan_ethtool_get_drvinfo, .get_ts_info = macvlan_ethtool_get_ts_info, }; static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, .ndo_get_stats64 = macvlan_dev_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid, .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = macvlan_dev_poll_controller, .ndo_netpoll_setup = macvlan_dev_netpoll_setup, .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, #endif .ndo_get_iflink = macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, .ndo_hwtstamp_get = macvlan_hwtstamp_get, .ndo_hwtstamp_set = macvlan_hwtstamp_set, }; static void macvlan_dev_free(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); /* Get rid of the macvlan's reference to lowerdev */ netdev_put(vlan->lowerdev, &vlan->dev_tracker); } void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->change_proto_down = true; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macvlan_dev_free; dev->header_ops = &macvlan_hard_header_ops; dev->ethtool_ops = &macvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(macvlan_common_setup); static void macvlan_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc(sizeof(*port), GFP_KERNEL); if (port == NULL) return -ENOMEM; port->dev = dev; ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_source_hash[i]); port->bc_queue_len_used = 0; port->bc_cutoff = 1; skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); if (err) kfree(port); else dev->priv_flags |= IFF_MACVLAN_PORT; return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); /* After this point, no packet can schedule bc_work anymore, * but we need to cancel it and purge left skbs if any. */ cancel_work_sync(&port->bc_work); while ((skb = __skb_dequeue(&port->bc_queue))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; if (src) dev_put(src->dev); kfree_skb(skb); } /* If the lower device address has been changed by passthru * macvlan, put it back. */ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { struct sockaddr_storage ss; ss.ss_family = port->dev->type; memcpy(&ss.__data, port->perm_addr, port->dev->addr_len); dev_set_mac_address(port->dev, &ss, NULL); } kfree(port); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *nla, *head; int rem, len; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: case MACVLAN_MODE_SOURCE: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: case MACVLAN_MACADDR_FLUSH: case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) return -EADDRNOTAVAIL; } if (data[IFLA_MACVLAN_MACADDR_DATA]) { head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { if (nla_type(nla) != IFLA_MACVLAN_MACADDR || nla_len(nla) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(nla))) return -EADDRNOTAVAIL; } } if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; } /* * reconfigure list of remote source mac address * (only for macvlan devices in source mode) * Note regarding alignment: all netlink data is aligned to 4 Byte, which * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. */ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, struct nlattr *data[]) { char *addr = NULL; int ret, rem, len; struct nlattr *nla, *head; struct macvlan_source_entry *entry; if (data[IFLA_MACVLAN_MACADDR]) addr = nla_data(data[IFLA_MACVLAN_MACADDR]); if (mode == MACVLAN_MACADDR_ADD) { if (!addr) return -EINVAL; return macvlan_hash_add_source(vlan, addr); } else if (mode == MACVLAN_MACADDR_DEL) { if (!addr) return -EINVAL; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) { macvlan_hash_del_source(entry); vlan->macaddr_count--; } } else if (mode == MACVLAN_MACADDR_FLUSH) { macvlan_flush_sources(vlan->port, vlan); } else if (mode == MACVLAN_MACADDR_SET) { macvlan_flush_sources(vlan->port, vlan); if (addr) { ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } if (!data[IFLA_MACVLAN_MACADDR_DATA]) return 0; head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { addr = nla_data(nla); ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } } else { return -EINVAL; } return 0; } int macvlan_common_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct macvlan_dev *vlan = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct net_device *lowerdev; struct macvlan_port *port; bool create = false; int macmode; int err; if (!tb[IFLA_LINK]) return -EINVAL; lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; /* When creating macvlans or macvtaps on top of other macvlans - use * the real device as the lowerdev. */ if (netif_is_macvlan(lowerdev)) lowerdev = macvlan_dev_real_dev(lowerdev); if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu; else if (dev->mtu > lowerdev->mtu) return -EINVAL; /* MTU range: 68 - lowerdev->max_mtu */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = lowerdev->max_mtu; if (!tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (!netif_is_macvlan_port(lowerdev)) { err = macvlan_port_create(lowerdev); if (err < 0) return err; create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. */ err = -EINVAL; goto destroy_macvlan_port; } vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (data && data[IFLA_MACVLAN_FLAGS]) vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { if (port->count) { err = -EINVAL; goto destroy_macvlan_port; } macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) { err = -EINVAL; goto destroy_macvlan_port; } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) goto destroy_macvlan_port; } vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN; if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); err = register_netdevice(dev); if (err < 0) goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; list_add_tail_rcu(&vlan->list, &port->vlans); update_port_bc_queue_len(vlan->port); netif_stacked_transfer_operstate(lowerdev, dev); linkwatch_fire_event(dev); return 0; unregister_netdev: /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); return err; destroy_macvlan_port: /* the macvlan port may be freed by macvlan_uninit when fail to register. * so we destroy the macvlan port only when it's valid. */ if (create && macvlan_port_get_rtnl(lowerdev)) { macvlan_flush_sources(port, vlan); macvlan_port_destroy(port->dev); } return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); static int macvlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { return macvlan_common_newlink(dev, params, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->mode == MACVLAN_MODE_SOURCE) macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); update_port_bc_queue_len(vlan->port); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; enum macvlan_macaddr_mode macmode; int ret; /* Validate mode, but don't set yet: setting flags may fail. */ if (data && data[IFLA_MACVLAN_MODE]) { set_mode = true; mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); /* Passthrough mode can't be set or cleared dynamically */ if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; if (vlan->mode == MACVLAN_MODE_SOURCE && vlan->mode != mode) macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) err = dev_set_promiscuity(vlan->lowerdev, -1); else err = dev_set_promiscuity(vlan->lowerdev, 1); if (err < 0) return err; } vlan->flags = flags; } if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) { vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); update_port_bc_queue_len(vlan->port); } if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); if (set_mode) vlan->mode = mode; if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) return -EINVAL; macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); ret = macvlan_changelink_sources(vlan, macmode, data); if (ret) return ret; } return 0; } static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) { if (vlan->macaddr_count == 0) return 0; return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); } static size_t macvlan_get_size(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */ ); } static int macvlan_fill_info_macaddr(struct sk_buff *skb, const struct macvlan_dev *vlan, const int i) { struct hlist_head *h = &vlan->port->vlan_source_hash[i]; struct macvlan_source_entry *entry; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (entry->vlan != vlan) continue; if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) return 1; } return 0; } static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; int i; struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) goto nla_put_failure; if (vlan->macaddr_count > 0) { nest = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA); if (nest == NULL) goto nla_put_failure; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { if (macvlan_fill_info_macaddr(skb, vlan, i)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used)) goto nla_put_failure; if (port->bc_cutoff != 1 && nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT }, [IFLA_MACVLAN_BC_CUTOFF] = { .type = NLA_S32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) { /* common fields */ ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; ops->changelink = macvlan_changelink; ops->get_size = macvlan_get_size; ops->fill_info = macvlan_fill_info; return rtnl_link_register(ops); }; EXPORT_SYMBOL_GPL(macvlan_link_register); static struct net *macvlan_get_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, .get_link_net = macvlan_get_link_net, .priv_size = sizeof(struct macvlan_dev), }; static void update_port_bc_queue_len(struct macvlan_port *port) { u32 max_bc_queue_len_req = 0; struct macvlan_dev *vlan; list_for_each_entry(vlan, &port->vlans, list) { if (vlan->bc_queue_len_req > max_bc_queue_len_req) max_bc_queue_len_req = vlan->bc_queue_len_req; } port->bc_queue_len_used = max_bc_queue_len_req; } static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); if (!netif_is_macvlan_port(dev)) return NOTIFY_DONE; port = macvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) netif_stacked_transfer_operstate(vlan->lowerdev, vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(vlan, &port->vlans, list) { if (vlan->dev->mtu <= dev->mtu) continue; dev_set_mtu(vlan->dev, dev->mtu); } break; case NETDEV_CHANGEADDR: if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, struct macvlan_dev, list); if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr)) return NOTIFY_BAD; break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to all vlans */ list_for_each_entry(vlan, &port->vlans, list) call_netdevice_notifiers(event, vlan->dev); } return NOTIFY_DONE; } static struct notifier_block macvlan_notifier_block __read_mostly = { .notifier_call = macvlan_device_event, }; static int __init macvlan_init_module(void) { int err; register_netdevice_notifier(&macvlan_notifier_block); err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; err1: unregister_netdevice_notifier(&macvlan_notifier_block); return err; } static void __exit macvlan_cleanup_module(void) { rtnl_link_unregister(&macvlan_link_ops); unregister_netdevice_notifier(&macvlan_notifier_block); } module_init(macvlan_init_module); module_exit(macvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Driver for MAC address based VLANs"); MODULE_ALIAS_RTNL_LINK("macvlan"); |
| 571 571 646 36 610 1810 293 1570 1 1 1 1 1 1 571 571 571 571 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto.c: transport protocol load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ip.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/stat.h> #include <linux/proc_fs.h> #include <net/ip_vs.h> /* * IPVS protocols can only be registered/unregistered when the ipvs * module is loaded/unloaded, so no lock is needed in accessing the * ipvs protocol table. */ #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ #define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; /* States for conn templates: NONE or words separated with ",", max 15 chars */ static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { [IP_VS_CTPL_S_NONE] = "NONE", [IP_VS_CTPL_S_ASSURED] = "ASSURED", }; /* * register an ipvs protocol */ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp->next = ip_vs_proto_table[hash]; ip_vs_proto_table[hash] = pp; if (pp->init != NULL) pp->init(pp); return 0; } /* * register an ipvs protocols netns related data */ static int register_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_protocol *pp) { unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); struct ip_vs_proto_data *pd = kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL); if (!pd) return -ENOMEM; pd->pp = pp; /* For speed issues */ pd->next = ipvs->proto_data_table[hash]; ipvs->proto_data_table[hash] = pd; atomic_set(&pd->appcnt, 0); /* Init app counter */ if (pp->init_netns != NULL) { int ret = pp->init_netns(ipvs, pd); if (ret) { /* unlink an free proto data */ ipvs->proto_data_table[hash] = pd->next; kfree(pd); return ret; } } return 0; } /* * unregister an ipvs protocol */ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) { struct ip_vs_protocol **pp_p; unsigned int hash = IP_VS_PROTO_HASH(pp->protocol); pp_p = &ip_vs_proto_table[hash]; for (; *pp_p; pp_p = &(*pp_p)->next) { if (*pp_p == pp) { *pp_p = pp->next; if (pp->exit != NULL) pp->exit(pp); return 0; } } return -ESRCH; } /* * unregister an ipvs protocols netns data */ static int unregister_ip_vs_proto_netns(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { struct ip_vs_proto_data **pd_p; unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol); pd_p = &ipvs->proto_data_table[hash]; for (; *pd_p; pd_p = &(*pd_p)->next) { if (*pd_p == pd) { *pd_p = pd->next; if (pd->pp->exit_netns != NULL) pd->pp->exit_netns(ipvs, pd); kfree(pd); return 0; } } return -ESRCH; } /* * get ip_vs_protocol object by its proto. */ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) { struct ip_vs_protocol *pp; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { if (pp->protocol == proto) return pp; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_get); /* * get ip_vs_protocol object data by netns and proto */ struct ip_vs_proto_data * ip_vs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) { struct ip_vs_proto_data *pd; unsigned int hash = IP_VS_PROTO_HASH(proto); for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { if (pd->pp->protocol == proto) return pd; } return NULL; } EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { if (pd->pp->timeout_change) pd->pp->timeout_change(pd, flags); } } } int * ip_vs_create_timeout_table(int *table, int size) { return kmemdup(table, size, GFP_KERNEL); } const char *ip_vs_state_name(const struct ip_vs_conn *cp) { unsigned int state = cp->state; struct ip_vs_protocol *pp; if (cp->flags & IP_VS_CONN_F_TEMPLATE) { if (state >= IP_VS_CTPL_S_LAST) return "ERR!"; return ip_vs_ctpl_state_name_table[state] ? : "?"; } pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; return pp->state_name(state); } static void ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[128]; struct iphdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI4->%pI4", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI4:%u->%pI4:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #ifdef CONFIG_IP_VS_IPV6 static void ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { char buf[192]; struct ipv6hdr _iph, *ih; ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "TRUNCATED"); else if (ih->nexthdr == IPPROTO_FRAGMENT) sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr); else { __be16 _ports[2], *pptr; pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), sizeof(_ports), _ports); if (pptr == NULL) sprintf(buf, "TRUNCATED %pI6c->%pI6c", &ih->saddr, &ih->daddr); else sprintf(buf, "%pI6c:%u->%pI6c:%u", &ih->saddr, ntohs(pptr[0]), &ih->daddr, ntohs(pptr[1])); } pr_debug("%s: %s %s\n", msg, pp->name, buf); } #endif void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } /* * per network name-space init */ int __net_init ip_vs_protocol_net_init(struct netns_ipvs *ipvs) { int i, ret; static struct ip_vs_protocol *protos[] = { #ifdef CONFIG_IP_VS_PROTO_TCP &ip_vs_protocol_tcp, #endif #ifdef CONFIG_IP_VS_PROTO_UDP &ip_vs_protocol_udp, #endif #ifdef CONFIG_IP_VS_PROTO_SCTP &ip_vs_protocol_sctp, #endif #ifdef CONFIG_IP_VS_PROTO_AH &ip_vs_protocol_ah, #endif #ifdef CONFIG_IP_VS_PROTO_ESP &ip_vs_protocol_esp, #endif }; for (i = 0; i < ARRAY_SIZE(protos); i++) { ret = register_ip_vs_proto_netns(ipvs, protos[i]); if (ret < 0) goto cleanup; } return 0; cleanup: ip_vs_protocol_net_cleanup(ipvs); return ret; } void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_proto_data *pd; int i; /* unregister all the ipvs proto data for this netns */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pd = ipvs->proto_data_table[i]) != NULL) unregister_ip_vs_proto_netns(ipvs, pd); } } int __init ip_vs_protocol_init(void) { char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ strcat(protocols, ", "); \ strcat(protocols, (p)->name); \ } while (0) #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif #ifdef CONFIG_IP_VS_PROTO_UDP REGISTER_PROTOCOL(&ip_vs_protocol_udp); #endif #ifdef CONFIG_IP_VS_PROTO_SCTP REGISTER_PROTOCOL(&ip_vs_protocol_sctp); #endif #ifdef CONFIG_IP_VS_PROTO_AH REGISTER_PROTOCOL(&ip_vs_protocol_ah); #endif #ifdef CONFIG_IP_VS_PROTO_ESP REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); return 0; } void ip_vs_protocol_cleanup(void) { struct ip_vs_protocol *pp; int i; /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) unregister_ip_vs_protocol(pp); } } |
| 23 8 4 551 1 12 278 2 264 5 3 167 391 24 305 233 212 15 2 40 2 155 110 84 3 1 185 5 2 1 3 2 6 7 1 3 3 1 4 8 2 1 3 9 4 3 1 1 1 6 2 1 4 4 2 1 12 23 5 1 20 86 1 6 1 1 79 5 4 1 1 1 3 1 69 5 1 2 54 8 1 1 53 6 2 2 21 26 2 20 5 2 9 5 11 14 10 3 3 88 80 9 6 82 52 83 10 32 43 43 5 48 11 73 1 1 7 1 5 2 3 4 1 1 2 20 10 9 1 1 9 2 2 5 3 1 1 3 3 1 1 1 7 7 1 1 5 2 12 2 5 1 1 8 3 8 12 1 8 8 161 8 1 3 138 2 14 1 10 6 1 609 8 3 11 6 13 5 6 14 23 69 26 88 9 4 1 14 5 3 8 12 9 20 2 4 161 1 6 5 85 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 | // SPDX-License-Identifier: GPL-2.0-or-later /* Userspace key control operations * * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/capability.h> #include <linux/cred.h> #include <linux/string.h> #include <linux/err.h> #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <keys/request_key_auth-type.h> #include "internal.h" #define KEY_MAX_DESC_SIZE 4096 static const unsigned char keyrings_capabilities[2] = { [0] = (KEYCTL_CAPS0_CAPABILITIES | (IS_ENABLED(CONFIG_PERSISTENT_KEYRINGS) ? KEYCTL_CAPS0_PERSISTENT_KEYRINGS : 0) | (IS_ENABLED(CONFIG_KEY_DH_OPERATIONS) ? KEYCTL_CAPS0_DIFFIE_HELLMAN : 0) | (IS_ENABLED(CONFIG_ASYMMETRIC_KEY_TYPE) ? KEYCTL_CAPS0_PUBLIC_KEY : 0) | (IS_ENABLED(CONFIG_BIG_KEYS) ? KEYCTL_CAPS0_BIG_KEY : 0) | KEYCTL_CAPS0_INVALIDATE | KEYCTL_CAPS0_RESTRICT_KEYRING | KEYCTL_CAPS0_MOVE ), [1] = (KEYCTL_CAPS1_NS_KEYRING_NAME | KEYCTL_CAPS1_NS_KEY_TAG | (IS_ENABLED(CONFIG_KEY_NOTIFICATIONS) ? KEYCTL_CAPS1_NOTIFICATIONS : 0) ), }; static int key_get_type_from_user(char *type, const char __user *_type, unsigned len) { int ret; ret = strncpy_from_user(type, _type, len); if (ret < 0) return ret; if (ret == 0 || ret >= len) return -EINVAL; if (type[0] == '.') return -EPERM; type[len - 1] = '\0'; return 0; } /* * Extract the description of a new key from userspace and either add it as a * new key to the specified keyring or update a matching key in that keyring. * * If the description is NULL or an empty string, the key type is asked to * generate one from the payload. * * The keyring must be writable so that we can attach the key to it. * * If successful, the new key's serial number is returned, otherwise an error * code is returned. */ SYSCALL_DEFINE5(add_key, const char __user *, _type, const char __user *, _description, const void __user *, _payload, size_t, plen, key_serial_t, ringid) { key_ref_t keyring_ref, key_ref; char type[32], *description; void *payload; long ret; ret = -EINVAL; if (plen > 1024 * 1024 - 1) goto error; /* draw all the data into kernel space */ ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; description = NULL; if (_description) { description = strndup_user(_description, KEY_MAX_DESC_SIZE); if (IS_ERR(description)) { ret = PTR_ERR(description); goto error; } if (!*description) { kfree(description); description = NULL; } else if ((description[0] == '.') && (strncmp(type, "keyring", 7) == 0)) { ret = -EPERM; goto error2; } } /* pull the payload in if one was supplied */ payload = NULL; if (plen) { ret = -ENOMEM; payload = kvmalloc(plen, GFP_KERNEL); if (!payload) goto error2; ret = -EFAULT; if (copy_from_user(payload, _payload, plen) != 0) goto error3; } /* find the target keyring (which must be writable) */ keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error3; } /* create or update the requested key and add it to the target * keyring */ key_ref = key_create_or_update(keyring_ref, type, description, payload, plen, KEY_PERM_UNDEF, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key_ref)) { ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); } else { ret = PTR_ERR(key_ref); } key_ref_put(keyring_ref); error3: kvfree_sensitive(payload, plen); error2: kfree(description); error: return ret; } /* * Search the process keyrings and keyring trees linked from those for a * matching key. Keyrings must have appropriate Search permission to be * searched. * * If a key is found, it will be attached to the destination keyring if there's * one specified and the serial number of the key will be returned. * * If no key is found, /sbin/request-key will be invoked if _callout_info is * non-NULL in an attempt to create a key. The _callout_info string will be * passed to /sbin/request-key to aid with completing the request. If the * _callout_info string is "" then it will be changed to "-". */ SYSCALL_DEFINE4(request_key, const char __user *, _type, const char __user *, _description, const char __user *, _callout_info, key_serial_t, destringid) { struct key_type *ktype; struct key *key; key_ref_t dest_ref; size_t callout_len; char type[32], *description, *callout_info; long ret; /* pull the type into kernel space */ ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; /* pull the description into kernel space */ description = strndup_user(_description, KEY_MAX_DESC_SIZE); if (IS_ERR(description)) { ret = PTR_ERR(description); goto error; } /* pull the callout info into kernel space */ callout_info = NULL; callout_len = 0; if (_callout_info) { callout_info = strndup_user(_callout_info, PAGE_SIZE); if (IS_ERR(callout_info)) { ret = PTR_ERR(callout_info); goto error2; } callout_len = strlen(callout_info); } /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { dest_ref = lookup_user_key(destringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; } } /* find the key type */ ktype = key_type_lookup(type); if (IS_ERR(ktype)) { ret = PTR_ERR(ktype); goto error4; } /* do the search */ key = request_key_and_link(ktype, description, NULL, callout_info, callout_len, NULL, key_ref_to_ptr(dest_ref), KEY_ALLOC_IN_QUOTA); if (IS_ERR(key)) { ret = PTR_ERR(key); goto error5; } /* wait for the key to finish being constructed */ ret = wait_for_key_construction(key, 1); if (ret < 0) goto error6; ret = key->serial; error6: key_put(key); error5: key_type_put(ktype); error4: key_ref_put(dest_ref); error3: kfree(callout_info); error2: kfree(description); error: return ret; } /* * Get the ID of the specified process keyring. * * The requested keyring must have search permission to be found. * * If successful, the ID of the requested keyring will be returned. */ long keyctl_get_keyring_ID(key_serial_t id, int create) { key_ref_t key_ref; unsigned long lflags; long ret; lflags = create ? KEY_LOOKUP_CREATE : 0; key_ref = lookup_user_key(id, lflags, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } ret = key_ref_to_ptr(key_ref)->serial; key_ref_put(key_ref); error: return ret; } /* * Join a (named) session keyring. * * Create and join an anonymous session keyring or join a named session * keyring, creating it if necessary. A named session keyring must have Search * permission for it to be joined. Session keyrings without this permit will * be skipped over. It is not permitted for userspace to create or join * keyrings whose name begin with a dot. * * If successful, the ID of the joined session keyring will be returned. */ long keyctl_join_session_keyring(const char __user *_name) { char *name; long ret; /* fetch the name from userspace */ name = NULL; if (_name) { name = strndup_user(_name, KEY_MAX_DESC_SIZE); if (IS_ERR(name)) { ret = PTR_ERR(name); goto error; } ret = -EPERM; if (name[0] == '.') goto error_name; } /* join the session */ ret = join_session_keyring(name); error_name: kfree(name); error: return ret; } /* * Update a key's data payload from the given data. * * The key must grant the caller Write permission and the key type must support * updating for this to work. A negative key can be positively instantiated * with this call. * * If successful, 0 will be returned. If the key type does not support * updating, then -EOPNOTSUPP will be returned. */ long keyctl_update_key(key_serial_t id, const void __user *_payload, size_t plen) { key_ref_t key_ref; void *payload; long ret; ret = -EINVAL; if (plen > PAGE_SIZE) goto error; /* pull the payload in if one was supplied */ payload = NULL; if (plen) { ret = -ENOMEM; payload = kvmalloc(plen, GFP_KERNEL); if (!payload) goto error; ret = -EFAULT; if (copy_from_user(payload, _payload, plen) != 0) goto error2; } /* find the target key (which must be writable) */ key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; } /* update the key */ ret = key_update(key_ref, payload, plen); key_ref_put(key_ref); error2: kvfree_sensitive(payload, plen); error: return ret; } /* * Revoke a key. * * The key must be grant the caller Write or Setattr permission for this to * work. The key type should give up its quota claim when revoked. The key * and any links to the key will be automatically garbage collected after a * certain amount of time (/proc/sys/kernel/keys/gc_delay). * * Keys with KEY_FLAG_KEEP set should not be revoked. * * If successful, 0 is returned. */ long keyctl_revoke_key(key_serial_t id) { key_ref_t key_ref; struct key *key; long ret; key_ref = lookup_user_key(id, 0, KEY_NEED_WRITE); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); if (ret != -EACCES) goto error; key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } } key = key_ref_to_ptr(key_ref); ret = 0; if (test_bit(KEY_FLAG_KEEP, &key->flags)) ret = -EPERM; else key_revoke(key); key_ref_put(key_ref); error: return ret; } /* * Invalidate a key. * * The key must be grant the caller Invalidate permission for this to work. * The key and any links to the key will be automatically garbage collected * immediately. * * Keys with KEY_FLAG_KEEP set should not be invalidated. * * If successful, 0 is returned. */ long keyctl_invalidate_key(key_serial_t id) { key_ref_t key_ref; struct key *key; long ret; kenter("%d", id); key_ref = lookup_user_key(id, 0, KEY_NEED_SEARCH); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); /* Root is permitted to invalidate certain special keys */ if (capable(CAP_SYS_ADMIN)) { key_ref = lookup_user_key(id, 0, KEY_SYSADMIN_OVERRIDE); if (IS_ERR(key_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_INVAL, &key_ref_to_ptr(key_ref)->flags)) goto invalidate; goto error_put; } goto error; } invalidate: key = key_ref_to_ptr(key_ref); ret = 0; if (test_bit(KEY_FLAG_KEEP, &key->flags)) ret = -EPERM; else key_invalidate(key); error_put: key_ref_put(key_ref); error: kleave(" = %ld", ret); return ret; } /* * Clear the specified keyring, creating an empty process keyring if one of the * special keyring IDs is used. * * The keyring must grant the caller Write permission and not have * KEY_FLAG_KEEP set for this to work. If successful, 0 will be returned. */ long keyctl_keyring_clear(key_serial_t ringid) { key_ref_t keyring_ref; struct key *keyring; long ret; keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); /* Root is permitted to invalidate certain special keyrings */ if (capable(CAP_SYS_ADMIN)) { keyring_ref = lookup_user_key(ringid, 0, KEY_SYSADMIN_OVERRIDE); if (IS_ERR(keyring_ref)) goto error; if (test_bit(KEY_FLAG_ROOT_CAN_CLEAR, &key_ref_to_ptr(keyring_ref)->flags)) goto clear; goto error_put; } goto error; } clear: keyring = key_ref_to_ptr(keyring_ref); if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) ret = -EPERM; else ret = keyring_clear(keyring); error_put: key_ref_put(keyring_ref); error: return ret; } /* * Create a link from a keyring to a key if there's no matching key in the * keyring, otherwise replace the link to the matching key with a link to the * new key. * * The key must grant the caller Link permission and the keyring must grant * the caller Write permission. Furthermore, if an additional link is created, * the keyring's quota will be extended. * * If successful, 0 will be returned. */ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid) { key_ref_t keyring_ref, key_ref; long ret; keyring_ref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_LINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; } ret = key_link(key_ref_to_ptr(keyring_ref), key_ref_to_ptr(key_ref)); key_ref_put(key_ref); error2: key_ref_put(keyring_ref); error: return ret; } /* * Unlink a key from a keyring. * * The keyring must grant the caller Write permission for this to work; the key * itself need not grant the caller anything. If the last link to a key is * removed then that key will be scheduled for destruction. * * Keys or keyrings with KEY_FLAG_KEEP set should not be unlinked. * * If successful, 0 will be returned. */ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid) { key_ref_t keyring_ref, key_ref; struct key *keyring, *key; long ret; keyring_ref = lookup_user_key(ringid, 0, KEY_NEED_WRITE); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error; } key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_NEED_UNLINK); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error2; } keyring = key_ref_to_ptr(keyring_ref); key = key_ref_to_ptr(key_ref); if (test_bit(KEY_FLAG_KEEP, &keyring->flags) && test_bit(KEY_FLAG_KEEP, &key->flags)) ret = -EPERM; else ret = key_unlink(keyring, key); key_ref_put(key_ref); error2: key_ref_put(keyring_ref); error: return ret; } /* * Move a link to a key from one keyring to another, displacing any matching * key from the destination keyring. * * The key must grant the caller Link permission and both keyrings must grant * the caller Write permission. There must also be a link in the from keyring * to the key. If both keyrings are the same, nothing is done. * * If successful, 0 will be returned. */ long keyctl_keyring_move(key_serial_t id, key_serial_t from_ringid, key_serial_t to_ringid, unsigned int flags) { key_ref_t key_ref, from_ref, to_ref; long ret; if (flags & ~KEYCTL_MOVE_EXCL) return -EINVAL; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_LINK); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); from_ref = lookup_user_key(from_ringid, 0, KEY_NEED_WRITE); if (IS_ERR(from_ref)) { ret = PTR_ERR(from_ref); goto error2; } to_ref = lookup_user_key(to_ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(to_ref)) { ret = PTR_ERR(to_ref); goto error3; } ret = key_move(key_ref_to_ptr(key_ref), key_ref_to_ptr(from_ref), key_ref_to_ptr(to_ref), flags); key_ref_put(to_ref); error3: key_ref_put(from_ref); error2: key_ref_put(key_ref); return ret; } /* * Return a description of a key to userspace. * * The key must grant the caller View permission for this to work. * * If there's a buffer, we place up to buflen bytes of data into it formatted * in the following way: * * type;uid;gid;perm;description<NUL> * * If successful, we return the amount of description available, irrespective * of how much we may have copied into the buffer. */ long keyctl_describe_key(key_serial_t keyid, char __user *buffer, size_t buflen) { struct key *key, *instkey; key_ref_t key_ref; char *infobuf; long ret; int desclen, infolen; key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_VIEW); if (IS_ERR(key_ref)) { /* viewing a key under construction is permitted if we have the * authorisation token handy */ if (PTR_ERR(key_ref) == -EACCES) { instkey = key_get_instantiation_authkey(keyid); if (!IS_ERR(instkey)) { key_put(instkey); key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } } ret = PTR_ERR(key_ref); goto error; } okay: key = key_ref_to_ptr(key_ref); desclen = strlen(key->description); /* calculate how much information we're going to return */ ret = -ENOMEM; infobuf = kasprintf(GFP_KERNEL, "%s;%d;%d;%08x;", key->type->name, from_kuid_munged(current_user_ns(), key->uid), from_kgid_munged(current_user_ns(), key->gid), key->perm); if (!infobuf) goto error2; infolen = strlen(infobuf); ret = infolen + desclen + 1; /* consider returning the data */ if (buffer && buflen >= ret) { if (copy_to_user(buffer, infobuf, infolen) != 0 || copy_to_user(buffer + infolen, key->description, desclen + 1) != 0) ret = -EFAULT; } kfree(infobuf); error2: key_ref_put(key_ref); error: return ret; } /* * Search the specified keyring and any keyrings it links to for a matching * key. Only keyrings that grant the caller Search permission will be searched * (this includes the starting keyring). Only keys with Search permission can * be found. * * If successful, the found key will be linked to the destination keyring if * supplied and the key has Link permission, and the found key ID will be * returned. */ long keyctl_keyring_search(key_serial_t ringid, const char __user *_type, const char __user *_description, key_serial_t destringid) { struct key_type *ktype; key_ref_t keyring_ref, key_ref, dest_ref; char type[32], *description; long ret; /* pull the type and description into kernel space */ ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; description = strndup_user(_description, KEY_MAX_DESC_SIZE); if (IS_ERR(description)) { ret = PTR_ERR(description); goto error; } /* get the keyring at which to begin the search */ keyring_ref = lookup_user_key(ringid, 0, KEY_NEED_SEARCH); if (IS_ERR(keyring_ref)) { ret = PTR_ERR(keyring_ref); goto error2; } /* get the destination keyring if specified */ dest_ref = NULL; if (destringid) { dest_ref = lookup_user_key(destringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(dest_ref)) { ret = PTR_ERR(dest_ref); goto error3; } } /* find the key type */ ktype = key_type_lookup(type); if (IS_ERR(ktype)) { ret = PTR_ERR(ktype); goto error4; } /* do the search */ key_ref = keyring_search(keyring_ref, ktype, description, true); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); /* treat lack or presence of a negative key the same */ if (ret == -EAGAIN) ret = -ENOKEY; goto error5; } /* link the resulting key to the destination keyring if we can */ if (dest_ref) { ret = key_permission(key_ref, KEY_NEED_LINK); if (ret < 0) goto error6; ret = key_link(key_ref_to_ptr(dest_ref), key_ref_to_ptr(key_ref)); if (ret < 0) goto error6; } ret = key_ref_to_ptr(key_ref)->serial; error6: key_ref_put(key_ref); error5: key_type_put(ktype); error4: key_ref_put(dest_ref); error3: key_ref_put(keyring_ref); error2: kfree(description); error: return ret; } /* * Call the read method */ static long __keyctl_read_key(struct key *key, char *buffer, size_t buflen) { long ret; down_read(&key->sem); ret = key_validate(key); if (ret == 0) ret = key->type->read(key, buffer, buflen); up_read(&key->sem); return ret; } /* * Read a key's payload. * * The key must either grant the caller Read permission, or it must grant the * caller Search permission when searched for from the process keyrings. * * If successful, we place up to buflen bytes of data into the buffer, if one * is provided, and return the amount of data that is available in the key, * irrespective of how much we copied into the buffer. */ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) { struct key *key; key_ref_t key_ref; long ret; char *key_data = NULL; size_t key_data_len; /* find the key first */ key_ref = lookup_user_key(keyid, 0, KEY_DEFER_PERM_CHECK); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto out; } key = key_ref_to_ptr(key_ref); ret = key_read_state(key); if (ret < 0) goto key_put_out; /* Negatively instantiated */ /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) goto can_read_key; if (ret != -EACCES) goto key_put_out; /* we can't; see if it's searchable from this process's keyrings * - we automatically take account of the fact that it may be * dangling off an instantiation key */ if (!is_key_possessed(key_ref)) { ret = -EACCES; goto key_put_out; } /* the key is probably readable - now try to read it */ can_read_key: if (!key->type->read) { ret = -EOPNOTSUPP; goto key_put_out; } if (!buffer || !buflen) { /* Get the key length from the read method */ ret = __keyctl_read_key(key, NULL, 0); goto key_put_out; } /* * Read the data with the semaphore held (since we might sleep) * to protect against the key being updated or revoked. * * Allocating a temporary buffer to hold the keys before * transferring them to user buffer to avoid potential * deadlock involving page fault and mmap_lock. * * key_data_len = (buflen <= PAGE_SIZE) * ? buflen : actual length of key data * * This prevents allocating arbitrary large buffer which can * be much larger than the actual key length. In the latter case, * at least 2 passes of this loop is required. */ key_data_len = (buflen <= PAGE_SIZE) ? buflen : 0; for (;;) { if (key_data_len) { key_data = kvmalloc(key_data_len, GFP_KERNEL); if (!key_data) { ret = -ENOMEM; goto key_put_out; } } ret = __keyctl_read_key(key, key_data, key_data_len); /* * Read methods will just return the required length without * any copying if the provided length isn't large enough. */ if (ret <= 0 || ret > buflen) break; /* * The key may change (unlikely) in between 2 consecutive * __keyctl_read_key() calls. In this case, we reallocate * a larger buffer and redo the key read when * key_data_len < ret <= buflen. */ if (ret > key_data_len) { if (unlikely(key_data)) kvfree_sensitive(key_data, key_data_len); key_data_len = ret; continue; /* Allocate buffer */ } if (copy_to_user(buffer, key_data, ret)) ret = -EFAULT; break; } kvfree_sensitive(key_data, key_data_len); key_put_out: key_put(key); out: return ret; } /* * Change the ownership of a key * * The key must grant the caller Setattr permission for this to work, though * the key need not be fully instantiated yet. For the UID to be changed, or * for the GID to be changed to a group the caller is not a member of, the * caller must have sysadmin capability. If either uid or gid is -1 then that * attribute is not changed. * * If the UID is to be changed, the new user must have sufficient quota to * accept the key. The quota deduction will be removed from the old user to * the new user should the attribute be changed. * * If successful, 0 will be returned. */ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) { struct key_user *newowner, *zapowner = NULL; struct key *key; key_ref_t key_ref; long ret; kuid_t uid; kgid_t gid; unsigned long flags; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); ret = -EINVAL; if ((user != (uid_t) -1) && !uid_valid(uid)) goto error; if ((group != (gid_t) -1) && !gid_valid(gid)) goto error; ret = 0; if (user == (uid_t) -1 && group == (gid_t) -1) goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } key = key_ref_to_ptr(key_ref); /* make the changes with the locks held to prevent chown/chown races */ ret = -EACCES; down_write(&key->sem); { bool is_privileged_op = false; /* only the sysadmin can chown a key to some other UID */ if (user != (uid_t) -1 && !uid_eq(key->uid, uid)) is_privileged_op = true; /* only the sysadmin can set the key's GID to a group other * than one of those that the current process subscribes to */ if (group != (gid_t) -1 && !gid_eq(gid, key->gid) && !in_group_p(gid)) is_privileged_op = true; if (is_privileged_op && !capable(CAP_SYS_ADMIN)) goto error_put; } /* change the UID */ if (user != (uid_t) -1 && !uid_eq(uid, key->uid)) { ret = -ENOMEM; newowner = key_user_lookup(uid); if (!newowner) goto error_put; /* transfer the quota burden to the new user */ if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; spin_lock_irqsave(&newowner->lock, flags); if (newowner->qnkeys + 1 > maxkeys || newowner->qnbytes + key->quotalen > maxbytes || newowner->qnbytes + key->quotalen < newowner->qnbytes) goto quota_overrun; newowner->qnkeys++; newowner->qnbytes += key->quotalen; spin_unlock_irqrestore(&newowner->lock, flags); spin_lock_irqsave(&key->user->lock, flags); key->user->qnkeys--; key->user->qnbytes -= key->quotalen; spin_unlock_irqrestore(&key->user->lock, flags); } atomic_dec(&key->user->nkeys); atomic_inc(&newowner->nkeys); if (key->state != KEY_IS_UNINSTANTIATED) { atomic_dec(&key->user->nikeys); atomic_inc(&newowner->nikeys); } zapowner = key->user; key->user = newowner; key->uid = uid; } /* change the GID */ if (group != (gid_t) -1) key->gid = gid; notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; error_put: up_write(&key->sem); key_put(key); if (zapowner) key_user_put(zapowner); error: return ret; quota_overrun: spin_unlock_irqrestore(&newowner->lock, flags); zapowner = newowner; ret = -EDQUOT; goto error_put; } /* * Change the permission mask on a key. * * The key must grant the caller Setattr permission for this to work, though * the key need not be fully instantiated yet. If the caller does not have * sysadmin capability, it may only change the permission on keys that it owns. */ long keyctl_setperm_key(key_serial_t id, key_perm_t perm) { struct key *key; key_ref_t key_ref; long ret; ret = -EINVAL; if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL)) goto error; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { ret = PTR_ERR(key_ref); goto error; } key = key_ref_to_ptr(key_ref); /* make the changes with the locks held to prevent chown/chmod races */ ret = -EACCES; down_write(&key->sem); /* if we're not the sysadmin, we can only change a key that we own */ if (uid_eq(key->uid, current_fsuid()) || capable(CAP_SYS_ADMIN)) { key->perm = perm; notify_key(key, NOTIFY_KEY_SETATTR, 0); ret = 0; } up_write(&key->sem); key_put(key); error: return ret; } /* * Get the destination keyring for instantiation and check that the caller has * Write permission on it. */ static long get_instantiation_keyring(key_serial_t ringid, struct request_key_auth *rka, struct key **_dest_keyring) { key_ref_t dkref; *_dest_keyring = NULL; /* just return a NULL pointer if we weren't asked to make a link */ if (ringid == 0) return 0; /* if a specific keyring is nominated by ID, then use that */ if (ringid > 0) { dkref = lookup_user_key(ringid, KEY_LOOKUP_CREATE, KEY_NEED_WRITE); if (IS_ERR(dkref)) return PTR_ERR(dkref); *_dest_keyring = key_ref_to_ptr(dkref); return 0; } if (ringid == KEY_SPEC_REQKEY_AUTH_KEY) return -EINVAL; /* otherwise specify the destination keyring recorded in the * authorisation key (any KEY_SPEC_*_KEYRING) */ if (ringid >= KEY_SPEC_REQUESTOR_KEYRING) { *_dest_keyring = key_get(rka->dest_keyring); return 0; } return -ENOKEY; } /* * Change the request_key authorisation key on the current process. */ static int keyctl_change_reqkey_auth(struct key *key) { struct cred *new; new = prepare_creds(); if (!new) return -ENOMEM; key_put(new->request_key_auth); new->request_key_auth = key_get(key); return commit_creds(new); } /* * Instantiate a key with the specified payload and link the key into the * destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * If successful, 0 will be returned. */ static long keyctl_instantiate_key_common(key_serial_t id, struct iov_iter *from, key_serial_t ringid) { const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; size_t plen = from ? iov_iter_count(from) : 0; void *payload; long ret; kenter("%d,,%zu,%d", id, plen, ringid); if (!plen) from = NULL; ret = -EINVAL; if (plen > 1024 * 1024 - 1) goto error; /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; instkey = cred->request_key_auth; if (!instkey) goto error; rka = instkey->payload.data[0]; if (rka->target_key->serial != id) goto error; /* pull the payload in if one was supplied */ payload = NULL; if (from) { ret = -ENOMEM; payload = kvmalloc(plen, GFP_KERNEL); if (!payload) goto error; ret = -EFAULT; if (!copy_from_iter_full(payload, plen, from)) goto error2; } /* find the destination keyring amongst those belonging to the * requesting task */ ret = get_instantiation_keyring(ringid, rka, &dest_keyring); if (ret < 0) goto error2; /* instantiate the key and link it into a keyring */ ret = key_instantiate_and_link(rka->target_key, payload, plen, dest_keyring, instkey); key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) keyctl_change_reqkey_auth(NULL); error2: kvfree_sensitive(payload, plen); error: return ret; } /* * Instantiate a key with the specified payload and link the key into the * destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * If successful, 0 will be returned. */ long keyctl_instantiate_key(key_serial_t id, const void __user *_payload, size_t plen, key_serial_t ringid) { if (_payload && plen) { struct iov_iter from; int ret; ret = import_ubuf(ITER_SOURCE, (void __user *)_payload, plen, &from); if (unlikely(ret)) return ret; return keyctl_instantiate_key_common(id, &from, ringid); } return keyctl_instantiate_key_common(id, NULL, ringid); } /* * Instantiate a key with the specified multipart payload and link the key into * the destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * If successful, 0 will be returned. */ long keyctl_instantiate_key_iov(key_serial_t id, const struct iovec __user *_payload_iov, unsigned ioc, key_serial_t ringid) { struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; struct iov_iter from; long ret; if (!_payload_iov) ioc = 0; ret = import_iovec(ITER_SOURCE, _payload_iov, ioc, ARRAY_SIZE(iovstack), &iov, &from); if (ret < 0) return ret; ret = keyctl_instantiate_key_common(id, &from, ringid); kfree(iov); return ret; } /* * Negatively instantiate the key with the given timeout (in seconds) and link * the key into the destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * The key and any links to the key will be automatically garbage collected * after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return -ENOKEY until the negative key expires. * * If successful, 0 will be returned. */ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid) { return keyctl_reject_key(id, timeout, ENOKEY, ringid); } /* * Negatively instantiate the key with the given timeout (in seconds) and error * code and link the key into the destination keyring if one is given. * * The caller must have the appropriate instantiation permit set for this to * work (see keyctl_assume_authority). No other permissions are required. * * The key and any links to the key will be automatically garbage collected * after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return the specified error code until the negative key expires. * * If successful, 0 will be returned. */ long keyctl_reject_key(key_serial_t id, unsigned timeout, unsigned error, key_serial_t ringid) { const struct cred *cred = current_cred(); struct request_key_auth *rka; struct key *instkey, *dest_keyring; long ret; kenter("%d,%u,%u,%d", id, timeout, error, ringid); /* must be a valid error code and mustn't be a kernel special */ if (error <= 0 || error >= MAX_ERRNO || error == ERESTARTSYS || error == ERESTARTNOINTR || error == ERESTARTNOHAND || error == ERESTART_RESTARTBLOCK) return -EINVAL; /* the appropriate instantiation authorisation key must have been * assumed before calling this */ ret = -EPERM; instkey = cred->request_key_auth; if (!instkey) goto error; rka = instkey->payload.data[0]; if (rka->target_key->serial != id) goto error; /* find the destination keyring if present (which must also be * writable) */ ret = get_instantiation_keyring(ringid, rka, &dest_keyring); if (ret < 0) goto error; /* instantiate the key and link it into a keyring */ ret = key_reject_and_link(rka->target_key, timeout, error, dest_keyring, instkey); key_put(dest_keyring); /* discard the assumed authority if it's just been disabled by * instantiation of the key */ if (ret == 0) keyctl_change_reqkey_auth(NULL); error: return ret; } /* * Read or set the default keyring in which request_key() will cache keys and * return the old setting. * * If a thread or process keyring is specified then it will be created if it * doesn't yet exist. The old setting will be returned if successful. */ long keyctl_set_reqkey_keyring(int reqkey_defl) { struct cred *new; int ret, old_setting; old_setting = current_cred_xxx(jit_keyring); if (reqkey_defl == KEY_REQKEY_DEFL_NO_CHANGE) return old_setting; new = prepare_creds(); if (!new) return -ENOMEM; switch (reqkey_defl) { case KEY_REQKEY_DEFL_THREAD_KEYRING: ret = install_thread_keyring_to_cred(new); if (ret < 0) goto error; goto set; case KEY_REQKEY_DEFL_PROCESS_KEYRING: ret = install_process_keyring_to_cred(new); if (ret < 0) goto error; goto set; case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_SESSION_KEYRING: case KEY_REQKEY_DEFL_USER_KEYRING: case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: goto set; case KEY_REQKEY_DEFL_NO_CHANGE: case KEY_REQKEY_DEFL_GROUP_KEYRING: default: ret = -EINVAL; goto error; } set: new->jit_keyring = reqkey_defl; commit_creds(new); return old_setting; error: abort_creds(new); return ret; } /* * Set or clear the timeout on a key. * * Either the key must grant the caller Setattr permission or else the caller * must hold an instantiation authorisation token for the key. * * The timeout is either 0 to clear the timeout, or a number of seconds from * the current time. The key and any links to the key will be automatically * garbage collected after the timeout expires. * * Keys with KEY_FLAG_KEEP set should not be timed out. * * If successful, 0 is returned. */ long keyctl_set_timeout(key_serial_t id, unsigned timeout) { struct key *key, *instkey; key_ref_t key_ref; long ret; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) { /* setting the timeout on a key under construction is permitted * if we have the authorisation token handy */ if (PTR_ERR(key_ref) == -EACCES) { instkey = key_get_instantiation_authkey(id); if (!IS_ERR(instkey)) { key_put(instkey); key_ref = lookup_user_key(id, KEY_LOOKUP_PARTIAL, KEY_AUTHTOKEN_OVERRIDE); if (!IS_ERR(key_ref)) goto okay; } } ret = PTR_ERR(key_ref); goto error; } okay: key = key_ref_to_ptr(key_ref); ret = 0; if (test_bit(KEY_FLAG_KEEP, &key->flags)) { ret = -EPERM; } else { key_set_timeout(key, timeout); notify_key(key, NOTIFY_KEY_SETATTR, 0); } key_put(key); error: return ret; } /* * Assume (or clear) the authority to instantiate the specified key. * * This sets the authoritative token currently in force for key instantiation. * This must be done for a key to be instantiated. It has the effect of making * available all the keys from the caller of the request_key() that created a * key to request_key() calls made by the caller of this function. * * The caller must have the instantiation key in their process keyrings with a * Search permission grant available to the caller. * * If the ID given is 0, then the setting will be cleared and 0 returned. * * If the ID given has a matching an authorisation key, then that key will be * set and its ID will be returned. The authorisation key can be read to get * the callout information passed to request_key(). */ long keyctl_assume_authority(key_serial_t id) { struct key *authkey; long ret; /* special key IDs aren't permitted */ ret = -EINVAL; if (id < 0) goto error; /* we divest ourselves of authority if given an ID of 0 */ if (id == 0) { ret = keyctl_change_reqkey_auth(NULL); goto error; } /* attempt to assume the authority temporarily granted to us whilst we * instantiate the specified key * - the authorisation key must be in the current task's keyrings * somewhere */ authkey = key_get_instantiation_authkey(id); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); goto error; } ret = keyctl_change_reqkey_auth(authkey); if (ret == 0) ret = authkey->serial; key_put(authkey); error: return ret; } /* * Get a key's the LSM security label. * * The key must grant the caller View permission for this to work. * * If there's a buffer, then up to buflen bytes of data will be placed into it. * * If successful, the amount of information available will be returned, * irrespective of how much was copied (including the terminal NUL). */ long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen) { struct key *key, *instkey; key_ref_t key_ref; char *context; long ret; key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_NEED_VIEW); if (IS_ERR(key_ref)) { if (PTR_ERR(key_ref) != -EACCES) return PTR_ERR(key_ref); /* viewing a key under construction is also permitted if we * have the authorisation token handy */ instkey = key_get_instantiation_authkey(keyid); if (IS_ERR(instkey)) return PTR_ERR(instkey); key_put(instkey); key_ref = lookup_user_key(keyid, KEY_LOOKUP_PARTIAL, KEY_AUTHTOKEN_OVERRIDE); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); } key = key_ref_to_ptr(key_ref); ret = security_key_getsecurity(key, &context); if (ret == 0) { /* if no information was returned, give userspace an empty * string */ ret = 1; if (buffer && buflen > 0 && copy_to_user(buffer, "", 1) != 0) ret = -EFAULT; } else if (ret > 0) { /* return as much data as there's room for */ if (buffer && buflen > 0) { if (buflen > ret) buflen = ret; if (copy_to_user(buffer, context, buflen) != 0) ret = -EFAULT; } kfree(context); } key_ref_put(key_ref); return ret; } /* * Attempt to install the calling process's session keyring on the process's * parent process. * * The keyring must exist and must grant the caller LINK permission, and the * parent process must be single-threaded and must have the same effective * ownership as this process and mustn't be SUID/SGID. * * The keyring will be emplaced on the parent when it next resumes userspace. * * If successful, 0 will be returned. */ long keyctl_session_to_parent(void) { struct task_struct *me, *parent; const struct cred *mycred, *pcred; struct callback_head *newwork, *oldwork; key_ref_t keyring_r; struct cred *cred; int ret; keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_NEED_LINK); if (IS_ERR(keyring_r)) return PTR_ERR(keyring_r); ret = -ENOMEM; /* our parent is going to need a new cred struct, a new tgcred struct * and new security data, so we allocate them here to prevent ENOMEM in * our parent */ cred = cred_alloc_blank(); if (!cred) goto error_keyring; newwork = &cred->rcu; cred->session_keyring = key_ref_to_ptr(keyring_r); keyring_r = NULL; init_task_work(newwork, key_change_session_keyring); me = current; rcu_read_lock(); write_lock_irq(&tasklist_lock); ret = -EPERM; oldwork = NULL; parent = rcu_dereference_protected(me->real_parent, lockdep_is_held(&tasklist_lock)); /* the parent mustn't be init and mustn't be a kernel thread */ if (parent->pid <= 1 || !parent->mm) goto unlock; /* the parent must be single threaded */ if (!thread_group_empty(parent)) goto unlock; /* the parent and the child must have different session keyrings or * there's no point */ mycred = current_cred(); pcred = __task_cred(parent); if (mycred == pcred || mycred->session_keyring == pcred->session_keyring) { ret = 0; goto unlock; } /* the parent must have the same effective ownership and mustn't be * SUID/SGID */ if (!uid_eq(pcred->uid, mycred->euid) || !uid_eq(pcred->euid, mycred->euid) || !uid_eq(pcred->suid, mycred->euid) || !gid_eq(pcred->gid, mycred->egid) || !gid_eq(pcred->egid, mycred->egid) || !gid_eq(pcred->sgid, mycred->egid)) goto unlock; /* the keyrings must have the same UID */ if ((pcred->session_keyring && !uid_eq(pcred->session_keyring->uid, mycred->euid)) || !uid_eq(mycred->session_keyring->uid, mycred->euid)) goto unlock; /* cancel an already pending keyring replacement */ oldwork = task_work_cancel_func(parent, key_change_session_keyring); /* the replacement session keyring is applied just prior to userspace * restarting */ ret = task_work_add(parent, newwork, TWA_RESUME); if (!ret) newwork = NULL; unlock: write_unlock_irq(&tasklist_lock); rcu_read_unlock(); if (oldwork) put_cred(container_of(oldwork, struct cred, rcu)); if (newwork) put_cred(cred); return ret; error_keyring: key_ref_put(keyring_r); return ret; } /* * Apply a restriction to a given keyring. * * The caller must have Setattr permission to change keyring restrictions. * * The requested type name may be a NULL pointer to reject all attempts * to link to the keyring. In this case, _restriction must also be NULL. * Otherwise, both _type and _restriction must be non-NULL. * * Returns 0 if successful. */ long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, const char __user *_restriction) { key_ref_t key_ref; char type[32]; char *restriction = NULL; long ret; key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); ret = -EINVAL; if (_type) { if (!_restriction) goto error; ret = key_get_type_from_user(type, _type, sizeof(type)); if (ret < 0) goto error; restriction = strndup_user(_restriction, PAGE_SIZE); if (IS_ERR(restriction)) { ret = PTR_ERR(restriction); goto error; } } else { if (_restriction) goto error; } ret = keyring_restrict(key_ref, _type ? type : NULL, restriction); kfree(restriction); error: key_ref_put(key_ref); return ret; } #ifdef CONFIG_KEY_NOTIFICATIONS /* * Watch for changes to a key. * * The caller must have View permission to watch a key or keyring. */ long keyctl_watch_key(key_serial_t id, int watch_queue_fd, int watch_id) { struct watch_queue *wqueue; struct watch_list *wlist = NULL; struct watch *watch = NULL; struct key *key; key_ref_t key_ref; long ret; if (watch_id < -1 || watch_id > 0xff) return -EINVAL; key_ref = lookup_user_key(id, KEY_LOOKUP_CREATE, KEY_NEED_VIEW); if (IS_ERR(key_ref)) return PTR_ERR(key_ref); key = key_ref_to_ptr(key_ref); wqueue = get_watch_queue(watch_queue_fd); if (IS_ERR(wqueue)) { ret = PTR_ERR(wqueue); goto err_key; } if (watch_id >= 0) { ret = -ENOMEM; if (!key->watchers) { wlist = kzalloc(sizeof(*wlist), GFP_KERNEL); if (!wlist) goto err_wqueue; init_watch_list(wlist, NULL); } watch = kzalloc(sizeof(*watch), GFP_KERNEL); if (!watch) goto err_wlist; init_watch(watch, wqueue); watch->id = key->serial; watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT; ret = security_watch_key(key); if (ret < 0) goto err_watch; down_write(&key->sem); if (!key->watchers) { key->watchers = wlist; wlist = NULL; } ret = add_watch_to_object(watch, key->watchers); up_write(&key->sem); if (ret == 0) watch = NULL; } else { ret = -EBADSLT; if (key->watchers) { down_write(&key->sem); ret = remove_watch_from_object(key->watchers, wqueue, key_serial(key), false); up_write(&key->sem); } } err_watch: kfree(watch); err_wlist: kfree(wlist); err_wqueue: put_watch_queue(wqueue); err_key: key_put(key); return ret; } #endif /* CONFIG_KEY_NOTIFICATIONS */ /* * Get keyrings subsystem capabilities. */ long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen) { size_t size = buflen; if (size > 0) { if (size > sizeof(keyrings_capabilities)) size = sizeof(keyrings_capabilities); if (copy_to_user(_buffer, keyrings_capabilities, size) != 0) return -EFAULT; if (size < buflen && clear_user(_buffer + size, buflen - size) != 0) return -EFAULT; } return sizeof(keyrings_capabilities); } /* * The key control system call */ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { switch (option) { case KEYCTL_GET_KEYRING_ID: return keyctl_get_keyring_ID((key_serial_t) arg2, (int) arg3); case KEYCTL_JOIN_SESSION_KEYRING: return keyctl_join_session_keyring((const char __user *) arg2); case KEYCTL_UPDATE: return keyctl_update_key((key_serial_t) arg2, (const void __user *) arg3, (size_t) arg4); case KEYCTL_REVOKE: return keyctl_revoke_key((key_serial_t) arg2); case KEYCTL_DESCRIBE: return keyctl_describe_key((key_serial_t) arg2, (char __user *) arg3, (unsigned) arg4); case KEYCTL_CLEAR: return keyctl_keyring_clear((key_serial_t) arg2); case KEYCTL_LINK: return keyctl_keyring_link((key_serial_t) arg2, (key_serial_t) arg3); case KEYCTL_UNLINK: return keyctl_keyring_unlink((key_serial_t) arg2, (key_serial_t) arg3); case KEYCTL_SEARCH: return keyctl_keyring_search((key_serial_t) arg2, (const char __user *) arg3, (const char __user *) arg4, (key_serial_t) arg5); case KEYCTL_READ: return keyctl_read_key((key_serial_t) arg2, (char __user *) arg3, (size_t) arg4); case KEYCTL_CHOWN: return keyctl_chown_key((key_serial_t) arg2, (uid_t) arg3, (gid_t) arg4); case KEYCTL_SETPERM: return keyctl_setperm_key((key_serial_t) arg2, (key_perm_t) arg3); case KEYCTL_INSTANTIATE: return keyctl_instantiate_key((key_serial_t) arg2, (const void __user *) arg3, (size_t) arg4, (key_serial_t) arg5); case KEYCTL_NEGATE: return keyctl_negate_key((key_serial_t) arg2, (unsigned) arg3, (key_serial_t) arg4); case KEYCTL_SET_REQKEY_KEYRING: return keyctl_set_reqkey_keyring(arg2); case KEYCTL_SET_TIMEOUT: return keyctl_set_timeout((key_serial_t) arg2, (unsigned) arg3); case KEYCTL_ASSUME_AUTHORITY: return keyctl_assume_authority((key_serial_t) arg2); case KEYCTL_GET_SECURITY: return keyctl_get_security((key_serial_t) arg2, (char __user *) arg3, (size_t) arg4); case KEYCTL_SESSION_TO_PARENT: return keyctl_session_to_parent(); case KEYCTL_REJECT: return keyctl_reject_key((key_serial_t) arg2, (unsigned) arg3, (unsigned) arg4, (key_serial_t) arg5); case KEYCTL_INSTANTIATE_IOV: return keyctl_instantiate_key_iov( (key_serial_t) arg2, (const struct iovec __user *) arg3, (unsigned) arg4, (key_serial_t) arg5); case KEYCTL_INVALIDATE: return keyctl_invalidate_key((key_serial_t) arg2); case KEYCTL_GET_PERSISTENT: return keyctl_get_persistent((uid_t)arg2, (key_serial_t)arg3); case KEYCTL_DH_COMPUTE: return keyctl_dh_compute((struct keyctl_dh_params __user *) arg2, (char __user *) arg3, (size_t) arg4, (struct keyctl_kdf_params __user *) arg5); case KEYCTL_RESTRICT_KEYRING: return keyctl_restrict_keyring((key_serial_t) arg2, (const char __user *) arg3, (const char __user *) arg4); case KEYCTL_PKEY_QUERY: if (arg3 != 0) return -EINVAL; return keyctl_pkey_query((key_serial_t)arg2, (const char __user *)arg4, (struct keyctl_pkey_query __user *)arg5); case KEYCTL_PKEY_ENCRYPT: case KEYCTL_PKEY_DECRYPT: case KEYCTL_PKEY_SIGN: return keyctl_pkey_e_d_s( option, (const struct keyctl_pkey_params __user *)arg2, (const char __user *)arg3, (const void __user *)arg4, (void __user *)arg5); case KEYCTL_PKEY_VERIFY: return keyctl_pkey_verify( (const struct keyctl_pkey_params __user *)arg2, (const char __user *)arg3, (const void __user *)arg4, (const void __user *)arg5); case KEYCTL_MOVE: return keyctl_keyring_move((key_serial_t)arg2, (key_serial_t)arg3, (key_serial_t)arg4, (unsigned int)arg5); case KEYCTL_CAPABILITIES: return keyctl_capabilities((unsigned char __user *)arg2, (size_t)arg3); case KEYCTL_WATCH_KEY: return keyctl_watch_key((key_serial_t)arg2, (int)arg3, (int)arg4); default: return -EOPNOTSUPP; } } |
| 577 666 30 682 1487 683 672 677 10 672 9 10 646 649 649 17 17 17 17 17 16 1 16 1 17 17 17 17 17 17 839 839 839 839 840 130 113 1 17 17 14 1 1 6 9 10 6 6 6 18 18 26 11 9 973 31 935 23 354 38 313 17 10 10 10 10 3 3 3 5 3 2 2 3 31 2 3 2 27 9 28 9 19 19 19 14 4 5 3 7 2 4 1 3 5 17 5 2 2 1 9 3 4 8 14 5 1 2 1 1 9 2 3 7 2 16 2 14 14 2 14 14 122 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright 2007 Hewlett-Packard Development Company, L.P. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ #include <crypto/sha1.h> #include <crypto/sha2.h> #include <linux/slab.h> #include <linux/types.h> #include <net/sctp/sctp.h> #include <net/sctp/auth.h> static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, .hmac_len = SHA256_DIGEST_SIZE, } }; static bool sctp_hmac_supported(__u16 hmac_id) { return hmac_id < ARRAY_SIZE(sctp_hmac_list) && sctp_hmac_list[hmac_id].hmac_len != 0; } void sctp_auth_key_put(struct sctp_auth_bytes *key) { if (!key) return; if (refcount_dec_and_test(&key->refcnt)) { kfree_sensitive(key); SCTP_DBG_OBJCNT_DEC(keys); } } /* Create a new key structure of a given length */ static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp) { struct sctp_auth_bytes *key; /* Verify that we are not going to overflow INT_MAX */ if (key_len > (INT_MAX - sizeof(struct sctp_auth_bytes))) return NULL; /* Allocate the shared key */ key = kmalloc(sizeof(struct sctp_auth_bytes) + key_len, gfp); if (!key) return NULL; key->len = key_len; refcount_set(&key->refcnt, 1); SCTP_DBG_OBJCNT_INC(keys); return key; } /* Create a new shared key container with a give key id */ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) { struct sctp_shared_key *new; /* Allocate the shared key container */ new = kzalloc(sizeof(struct sctp_shared_key), gfp); if (!new) return NULL; INIT_LIST_HEAD(&new->key_list); refcount_set(&new->refcnt, 1); new->key_id = key_id; return new; } /* Free the shared key structure */ static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key) { BUG_ON(!list_empty(&sh_key->key_list)); sctp_auth_key_put(sh_key->key); sh_key->key = NULL; kfree(sh_key); } void sctp_auth_shkey_release(struct sctp_shared_key *sh_key) { if (refcount_dec_and_test(&sh_key->refcnt)) sctp_auth_shkey_destroy(sh_key); } void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key) { refcount_inc(&sh_key->refcnt); } /* Destroy the entire key list. This is done during the * associon and endpoint free process. */ void sctp_auth_destroy_keys(struct list_head *keys) { struct sctp_shared_key *ep_key; struct sctp_shared_key *tmp; if (list_empty(keys)) return; key_for_each_safe(ep_key, tmp, keys) { list_del_init(&ep_key->key_list); sctp_auth_shkey_release(ep_key); } } /* Compare two byte vectors as numbers. Return values * are: * 0 - vectors are equal * < 0 - vector 1 is smaller than vector2 * > 0 - vector 1 is greater than vector2 * * Algorithm is: * This is performed by selecting the numerically smaller key vector... * If the key vectors are equal as numbers but differ in length ... * the shorter vector is considered smaller * * Examples (with small values): * 000123456789 > 123456789 (first number is longer) * 000123456789 < 234567891 (second number is larger numerically) * 123456789 > 2345678 (first number is both larger & longer) */ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, struct sctp_auth_bytes *vector2) { int diff; int i; const __u8 *longer; diff = vector1->len - vector2->len; if (diff) { longer = (diff > 0) ? vector1->data : vector2->data; /* Check to see if the longer number is * lead-zero padded. If it is not, it * is automatically larger numerically. */ for (i = 0; i < abs(diff); i++) { if (longer[i] != 0) return diff; } } /* lengths are the same, compare numbers */ return memcmp(vector1->data, vector2->data, vector1->len); } /* * Create a key vector as described in SCTP-AUTH, Section 6.1 * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO * parameter sent by each endpoint are concatenated as byte vectors. * These parameters include the parameter type, parameter length, and * the parameter value, but padding is omitted; all padding MUST be * removed from this concatenation before proceeding with further * computation of keys. Parameters which were not sent are simply * omitted from the concatenation process. The resulting two vectors * are called the two key vectors. */ static struct sctp_auth_bytes *sctp_auth_make_key_vector( struct sctp_random_param *random, struct sctp_chunks_param *chunks, struct sctp_hmac_algo_param *hmacs, gfp_t gfp) { struct sctp_auth_bytes *new; __u32 len; __u32 offset = 0; __u16 random_len, hmacs_len, chunks_len = 0; random_len = ntohs(random->param_hdr.length); hmacs_len = ntohs(hmacs->param_hdr.length); if (chunks) chunks_len = ntohs(chunks->param_hdr.length); len = random_len + hmacs_len + chunks_len; new = sctp_auth_create_key(len, gfp); if (!new) return NULL; memcpy(new->data, random, random_len); offset += random_len; if (chunks) { memcpy(new->data + offset, chunks, chunks_len); offset += chunks_len; } memcpy(new->data + offset, hmacs, hmacs_len); return new; } /* Make a key vector based on our local parameters */ static struct sctp_auth_bytes *sctp_auth_make_local_vector( const struct sctp_association *asoc, gfp_t gfp) { return sctp_auth_make_key_vector( (struct sctp_random_param *)asoc->c.auth_random, (struct sctp_chunks_param *)asoc->c.auth_chunks, (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs, gfp); } /* Make a key vector based on peer's parameters */ static struct sctp_auth_bytes *sctp_auth_make_peer_vector( const struct sctp_association *asoc, gfp_t gfp) { return sctp_auth_make_key_vector(asoc->peer.peer_random, asoc->peer.peer_chunks, asoc->peer.peer_hmacs, gfp); } /* Set the value of the association shared key base on the parameters * given. The algorithm is: * From the endpoint pair shared keys and the key vectors the * association shared keys are computed. This is performed by selecting * the numerically smaller key vector and concatenating it to the * endpoint pair shared key, and then concatenating the numerically * larger key vector to that. The result of the concatenation is the * association shared key. */ static struct sctp_auth_bytes *sctp_auth_asoc_set_secret( struct sctp_shared_key *ep_key, struct sctp_auth_bytes *first_vector, struct sctp_auth_bytes *last_vector, gfp_t gfp) { struct sctp_auth_bytes *secret; __u32 offset = 0; __u32 auth_len; auth_len = first_vector->len + last_vector->len; if (ep_key->key) auth_len += ep_key->key->len; secret = sctp_auth_create_key(auth_len, gfp); if (!secret) return NULL; if (ep_key->key) { memcpy(secret->data, ep_key->key->data, ep_key->key->len); offset += ep_key->key->len; } memcpy(secret->data + offset, first_vector->data, first_vector->len); offset += first_vector->len; memcpy(secret->data + offset, last_vector->data, last_vector->len); return secret; } /* Create an association shared key. Follow the algorithm * described in SCTP-AUTH, Section 6.1 */ static struct sctp_auth_bytes *sctp_auth_asoc_create_secret( const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *local_key_vector; struct sctp_auth_bytes *peer_key_vector; struct sctp_auth_bytes *first_vector, *last_vector; struct sctp_auth_bytes *secret = NULL; int cmp; /* Now we need to build the key vectors * SCTP-AUTH , Section 6.1 * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO * parameter sent by each endpoint are concatenated as byte vectors. * These parameters include the parameter type, parameter length, and * the parameter value, but padding is omitted; all padding MUST be * removed from this concatenation before proceeding with further * computation of keys. Parameters which were not sent are simply * omitted from the concatenation process. The resulting two vectors * are called the two key vectors. */ local_key_vector = sctp_auth_make_local_vector(asoc, gfp); peer_key_vector = sctp_auth_make_peer_vector(asoc, gfp); if (!peer_key_vector || !local_key_vector) goto out; /* Figure out the order in which the key_vectors will be * added to the endpoint shared key. * SCTP-AUTH, Section 6.1: * This is performed by selecting the numerically smaller key * vector and concatenating it to the endpoint pair shared * key, and then concatenating the numerically larger key * vector to that. If the key vectors are equal as numbers * but differ in length, then the concatenation order is the * endpoint shared key, followed by the shorter key vector, * followed by the longer key vector. Otherwise, the key * vectors are identical, and may be concatenated to the * endpoint pair key in any order. */ cmp = sctp_auth_compare_vectors(local_key_vector, peer_key_vector); if (cmp < 0) { first_vector = local_key_vector; last_vector = peer_key_vector; } else { first_vector = peer_key_vector; last_vector = local_key_vector; } secret = sctp_auth_asoc_set_secret(ep_key, first_vector, last_vector, gfp); out: sctp_auth_key_put(local_key_vector); sctp_auth_key_put(peer_key_vector); return secret; } /* * Populate the association overlay list with the list * from the endpoint. */ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp) { struct sctp_shared_key *sh_key; struct sctp_shared_key *new; BUG_ON(!list_empty(&asoc->endpoint_shared_keys)); key_for_each(sh_key, &ep->endpoint_shared_keys) { new = sctp_auth_shkey_create(sh_key->key_id, gfp); if (!new) goto nomem; new->key = sh_key->key; sctp_auth_key_hold(new->key); list_add(&new->key_list, &asoc->endpoint_shared_keys); } return 0; nomem: sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); return -ENOMEM; } /* Public interface to create the association shared key. * See code above for the algorithm. */ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) { struct sctp_auth_bytes *secret; struct sctp_shared_key *ep_key; struct sctp_chunk *chunk; /* If we don't support AUTH, or peer is not capable * we don't need to do anything. */ if (!asoc->peer.auth_capable) return 0; /* If the key_id is non-zero and we couldn't find an * endpoint pair shared key, we can't compute the * secret. * For key_id 0, endpoint pair shared key is a NULL key. */ ep_key = sctp_auth_get_shkey(asoc, asoc->active_key_id); BUG_ON(!ep_key); secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); if (!secret) return -ENOMEM; sctp_auth_key_put(asoc->asoc_shared_key); asoc->asoc_shared_key = secret; asoc->shkey = ep_key; /* Update send queue in case any chunk already in there now * needs authenticating */ list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) { if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) { chunk->auth = 1; if (!chunk->shkey) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } } } return 0; } /* Find the endpoint pair shared key based on the key_id */ struct sctp_shared_key *sctp_auth_get_shkey( const struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; /* First search associations set of endpoint pair shared keys */ key_for_each(key, &asoc->endpoint_shared_keys) { if (key->key_id == key_id) { if (!key->deactivated) return key; break; } } return NULL; } const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } /* Get an hmac description information that we can use to build * the AUTH chunk */ const struct sctp_hmac * sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; __u16 id = 0; int i; /* If we have a default entry, use it */ if (asoc->default_hmac_id) return &sctp_hmac_list[asoc->default_hmac_id]; /* Since we do not have a default entry, find the first entry * we support and return that. Do not cache that id. */ hmacs = asoc->peer.peer_hmacs; if (!hmacs) return NULL; n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (sctp_hmac_supported(id)) return &sctp_hmac_list[id]; } return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) { int found = 0; int i; for (i = 0; i < n_elts; i++) { if (hmac_id == hmacs[i]) { found = 1; break; } } return found; } /* See if the HMAC_ID is one that we claim as supported */ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, __be16 hmac_id) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; if (!asoc) return 0; hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs; n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; return __sctp_auth_find_hmacid(hmacs->hmac_ids, n_elt, hmac_id); } /* Cache the default HMAC id. This to follow this text from SCTP-AUTH: * Section 6.1: * The receiver of a HMAC-ALGO parameter SHOULD use the first listed * algorithm it supports. */ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { __u16 id; int i; int n_params; /* if the default id is already set, use it */ if (asoc->default_hmac_id) return; n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } } } /* Check to see if the given chunk is supposed to be authenticated */ static int __sctp_auth_cid(enum sctp_cid chunk, struct sctp_chunks_param *param) { unsigned short len; int found = 0; int i; if (!param || param->param_hdr.length == 0) return 0; len = ntohs(param->param_hdr.length) - sizeof(struct sctp_paramhdr); /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH * chunks MUST NOT be listed in the CHUNKS parameter. However, if * a CHUNKS parameter is received then the types for INIT, INIT-ACK, * SHUTDOWN-COMPLETE and AUTH chunks MUST be ignored. */ for (i = 0; !found && i < len; i++) { switch (param->chunks[i]) { case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: case SCTP_CID_AUTH: break; default: if (param->chunks[i] == chunk) found = 1; break; } } return found; } /* Check if peer requested that this chunk is authenticated */ int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc) { if (!asoc) return 0; if (!asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, asoc->peer.peer_chunks); } /* Check if we requested that peer authenticate this chunk. */ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc) { if (!asoc) return 0; if (!asoc->peer.auth_capable) return 0; return __sctp_auth_cid(chunk, (struct sctp_chunks_param *)asoc->c.auth_chunks); } /* SCTP-AUTH: Section 6.2: * The sender MUST calculate the MAC as described in RFC2104 [2] using * the hash function H as described by the MAC Identifier and the shared * association key K based on the endpoint pair shared key described by * the shared key identifier. The 'data' used for the computation of * the AUTH-chunk is given by the AUTH chunk with its HMAC field set to * zero (as shown in Figure 6) followed by all chunks that are placed * after the AUTH chunk in the SCTP packet. */ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sk_buff *skb, struct sctp_auth_chunk *auth, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; __u16 key_id, hmac_id; int free_key = 0; size_t data_len; __u8 *digest; /* Extract the info we need: * - hmac id * - key id */ key_id = ntohs(auth->auth_hdr.shkey_id); hmac_id = ntohs(auth->auth_hdr.hmac_id); if (key_id == asoc->active_key_id) asoc_key = asoc->asoc_shared_key; else { /* ep_key can't be NULL here */ asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); if (!asoc_key) return; free_key = 1; } data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, (const u8 *)auth, data_len, digest); } else { WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, (const u8 *)auth, data_len, digest); } if (free_key) sctp_auth_key_put(asoc_key); } /* API Helpers */ /* Add a chunk to the endpoint authenticated chunk list */ int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id) { struct sctp_chunks_param *p = ep->auth_chunk_list; __u16 nchunks; __u16 param_len; /* If this chunk is already specified, we are done */ if (__sctp_auth_cid(chunk_id, p)) return 0; /* Check if we can add this chunk to the array */ param_len = ntohs(p->param_hdr.length); nchunks = param_len - sizeof(struct sctp_paramhdr); if (nchunks == SCTP_NUM_CHUNK_TYPES) return -EINVAL; p->chunks[nchunks] = chunk_id; p->param_hdr.length = htons(param_len + 1); return 0; } /* Add hmac identifires to the endpoint list of supported hmac ids */ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, struct sctp_hmacalgo *hmacs) { int has_sha1 = 0; __u16 id; int i; /* Scan the list looking for unsupported id. Also make sure that * SHA1 is specified. */ for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; } if (!has_sha1) return -EINVAL; for (i = 0; i < hmacs->shmac_num_idents; i++) ep->auth_hmacs_list->hmac_ids[i] = htons(hmacs->shmac_idents[i]); ep->auth_hmacs_list->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + hmacs->shmac_num_idents * sizeof(__u16)); return 0; } /* Set a new shared key on either endpoint or association. If the * key with a same ID already exists, replace the key (remove the * old key and add a new one). */ int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key) { struct sctp_shared_key *cur_key, *shkey; struct sctp_auth_bytes *key; struct list_head *sh_keys; int replace = 0; /* Try to find the given key id to see if * we are doing a replace, or adding a new key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; sh_keys = &ep->endpoint_shared_keys; } key_for_each(shkey, sh_keys) { if (shkey->key_id == auth_key->sca_keynumber) { replace = 1; break; } } cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL); if (!cur_key) return -ENOMEM; /* Create a new key data based on the info passed in */ key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL); if (!key) { kfree(cur_key); return -ENOMEM; } memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength); cur_key->key = key; if (!replace) { list_add(&cur_key->key_list, sh_keys); return 0; } list_del_init(&shkey->key_list); list_add(&cur_key->key_list, sh_keys); if (asoc && asoc->active_key_id == auth_key->sca_keynumber && sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { list_del_init(&cur_key->key_list); sctp_auth_shkey_release(cur_key); list_add(&shkey->key_list, sh_keys); return -ENOMEM; } sctp_auth_shkey_release(shkey); return 0; } int sctp_auth_set_active_key(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found || key->deactivated) return -EINVAL; if (asoc) { __u16 active_key_id = asoc->active_key_id; asoc->active_key_id = key_id; if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { asoc->active_key_id = active_key_id; return -ENOMEM; } } else ep->active_key_id = key_id; return 0; } int sctp_auth_del_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST NOT be the current active key * The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found) return -EINVAL; /* Delete the shared key */ list_del_init(&key->key_list); sctp_auth_shkey_release(key); return 0; } int sctp_auth_deact_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id) { struct sctp_shared_key *key; struct list_head *sh_keys; int found = 0; /* The key identifier MUST NOT be the current active key * The key identifier MUST correst to an existing key */ if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; if (asoc->active_key_id == key_id) return -EINVAL; sh_keys = &asoc->endpoint_shared_keys; } else { if (!ep->auth_enable) return -EACCES; if (ep->active_key_id == key_id) return -EINVAL; sh_keys = &ep->endpoint_shared_keys; } key_for_each(key, sh_keys) { if (key->key_id == key_id) { found = 1; break; } } if (!found) return -EINVAL; /* refcnt == 1 and !list_empty mean it's not being used anywhere * and deactivated will be set, so it's time to notify userland * that this shkey can be freed. */ if (asoc && !list_empty(&key->key_list) && refcount_read(&key->refcnt) == 1) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, key->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } key->deactivated = 1; return 0; } int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. */ if (!ep->auth_hmacs_list) { struct sctp_hmac_algo_param *auth_hmacs; auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids, SCTP_AUTH_NUM_HMACS), gfp); if (!auth_hmacs) goto nomem; /* Initialize the HMACS parameter. * SCTP-AUTH: Section 3.3 * Every endpoint supporting SCTP chunk authentication MUST * support the HMAC based on the SHA-1 algorithm. */ auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; auth_hmacs->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + 2); auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); ep->auth_hmacs_list = auth_hmacs; } if (!ep->auth_chunk_list) { struct sctp_chunks_param *auth_chunks; auth_chunks = kzalloc(sizeof(*auth_chunks) + SCTP_NUM_CHUNK_TYPES, gfp); if (!auth_chunks) goto nomem; /* Initialize the CHUNKS parameter */ auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; auth_chunks->param_hdr.length = htons(sizeof(struct sctp_paramhdr)); ep->auth_chunk_list = auth_chunks; } return 0; nomem: /* Free all allocations */ kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) { kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; } |
| 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1 1 2 2 154 154 154 154 154 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, }, }; static int devlink_param_generic_verify(const struct devlink_param *param) { /* verify it match generic parameter by id and name */ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) return -EINVAL; if (strcmp(param->name, devlink_param_generic[param->id].name)) return -ENOENT; WARN_ON(param->type != devlink_param_generic[param->id].type); return 0; } static int devlink_param_driver_verify(const struct devlink_param *param) { int i; if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) return -EINVAL; /* verify no such name in generic params */ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) if (!strcmp(param->name, devlink_param_generic[i].name)) return -EEXIST; return 0; } static struct devlink_param_item * devlink_param_find_by_name(struct xarray *params, const char *param_name) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(params, param_id, param_item) { if (!strcmp(param_item->param->name, param_name)) return param_item; } return NULL; } static struct devlink_param_item * devlink_param_find_by_id(struct xarray *params, u32 param_id) { return xa_load(params, param_id); } static bool devlink_param_cmode_is_supported(const struct devlink_param *param, enum devlink_param_cmode cmode) { return test_bit(cmode, ¶m->supported_cmodes); } static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack) { if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx, extack); } static int devlink_nl_param_value_fill_one(struct sk_buff *msg, enum devlink_param_type type, enum devlink_param_cmode cmode, union devlink_param_value val) { struct nlattr *param_value_attr; param_value_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM_VALUE); if (!param_value_attr) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) goto value_nest_cancel; switch (type) { case DEVLINK_PARAM_TYPE_U8: if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U16: if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U32: if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U64: if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu64)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_STRING: if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vstr)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_BOOL: if (val.vbool && nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) goto value_nest_cancel; break; } nla_nest_end(msg, param_value_attr); return 0; value_nest_cancel: nla_nest_cancel(msg, param_value_attr); nla_put_failure: return -EMSGSIZE; } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; const struct devlink_param *param = param_item->param; struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; void *hdr; int err; int i; /* Get value from driver part to driverinit configuration mode */ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { if (!devlink_param_cmode_is_supported(param, i)) continue; if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { if (param_item->driverinit_value_new_valid) param_value[i] = param_item->driverinit_value_new; else if (param_item->driverinit_value_valid) param_value[i] = param_item->driverinit_value; else return -EOPNOTSUPP; } else { ctx.cmode = i; err = devlink_param_get(devlink, param, &ctx); if (err) return err; param_value[i] = ctx.val; } param_value_set[i] = true; } hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; if (cmd == DEVLINK_CMD_PORT_PARAM_GET || cmd == DEVLINK_CMD_PORT_PARAM_NEW || cmd == DEVLINK_CMD_PORT_PARAM_DEL) if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) goto genlmsg_cancel; param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); if (!param_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) goto param_nest_cancel; if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); if (!param_values_list) goto param_nest_cancel; for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { if (!param_value_set[i]) continue; err = devlink_nl_param_value_fill_one(msg, param->type, i, param_value[i]); if (err) goto values_list_nest_cancel; } nla_nest_end(msg, param_values_list); nla_nest_end(msg, param_attr); genlmsg_end(msg, hdr); return 0; values_list_nest_cancel: nla_nest_end(msg, param_values_list); param_nest_cancel: nla_nest_cancel(msg, param_attr); genlmsg_cancel: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_param_notify(struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); /* devlink_notify_register() / devlink_notify_unregister() * will replay the notifications if the params are added/removed * outside of the lifetime of the instance. */ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } static void devlink_params_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(&devlink->params, param_id, param_item) devlink_param_notify(devlink, 0, param_item, cmd); } void devlink_params_notify_register(struct devlink *devlink) { devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); } void devlink_params_notify_unregister(struct devlink *devlink) { devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); } static int devlink_nl_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; unsigned long param_id; int err = 0; xa_for_each_start(&devlink->params, param_id, param_item, state->idx) { err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { state->idx = param_id; break; } } return err; } int devlink_nl_param_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); } static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) { if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]); return 0; } static int devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { struct nlattr *param_data; int len; param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) return -EINVAL; switch (param->type) { case DEVLINK_PARAM_TYPE_U8: if (nla_len(param_data) != sizeof(u8)) return -EINVAL; value->vu8 = nla_get_u8(param_data); break; case DEVLINK_PARAM_TYPE_U16: if (nla_len(param_data) != sizeof(u16)) return -EINVAL; value->vu16 = nla_get_u16(param_data); break; case DEVLINK_PARAM_TYPE_U32: if (nla_len(param_data) != sizeof(u32)) return -EINVAL; value->vu32 = nla_get_u32(param_data); break; case DEVLINK_PARAM_TYPE_U64: if (nla_len(param_data) != sizeof(u64)) return -EINVAL; value->vu64 = nla_get_u64(param_data); break; case DEVLINK_PARAM_TYPE_STRING: len = strnlen(nla_data(param_data), nla_len(param_data)); if (len == nla_len(param_data) || len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; strcpy(value->vstr, nla_data(param_data)); break; case DEVLINK_PARAM_TYPE_BOOL: if (param_data && nla_len(param_data)) return -EINVAL; value->vbool = nla_get_flag(param_data); break; } return 0; } static struct devlink_param_item * devlink_param_get_from_info(struct xarray *params, struct genl_info *info) { char *param_name; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); return devlink_param_find_by_name(params, param_name); } int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; param_item = devlink_param_get_from_info(&devlink->params, info); if (!param_item) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, unsigned int port_index, struct xarray *params, struct genl_info *info, enum devlink_command cmd) { enum devlink_param_type param_type; struct devlink_param_gset_ctx ctx; enum devlink_param_cmode cmode; struct devlink_param_item *param_item; const struct devlink_param *param; union devlink_param_value value; int err = 0; param_item = devlink_param_get_from_info(params, info); if (!param_item) return -EINVAL; param = param_item->param; err = devlink_param_type_get_from_info(info, ¶m_type); if (err) return err; if (param_type != param->type) return -EINVAL; err = devlink_param_value_get_from_info(param, info, &value); if (err) return err; if (param->validate) { err = param->validate(devlink, param->id, value, info->extack); if (err) return err; } if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) return -EINVAL; cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); if (!devlink_param_cmode_is_supported(param, cmode)) return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { param_item->driverinit_value_new = value; param_item->driverinit_value_new_valid = true; } else { if (!param->set) return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; err = devlink_param_set(devlink, param, &ctx, info->extack); if (err) return err; } devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, info, DEVLINK_CMD_PARAM_NEW); } int devlink_nl_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { NL_SET_ERR_MSG(cb->extack, "Port params are not supported"); return msg->len; } int devlink_nl_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } int devlink_nl_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } static int devlink_param_verify(const struct devlink_param *param) { if (!param || !param->name || !param->supported_cmodes) return -EINVAL; if (param->generic) return devlink_param_generic_verify(param); else return devlink_param_driver_verify(param); } static int devlink_param_register(struct devlink *devlink, const struct devlink_param *param) { struct devlink_param_item *param_item; int err; WARN_ON(devlink_param_verify(param)); WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) WARN_ON(param->get || param->set); else WARN_ON(!param->get || !param->set); param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); if (!param_item) return -ENOMEM; param_item->param = param; err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL); if (err) goto err_xa_insert; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); return 0; err_xa_insert: kfree(param_item); return err; } static void devlink_param_unregister(struct devlink *devlink, const struct devlink_param *param) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_id(&devlink->params, param->id); if (WARN_ON(!param_item)) return; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); xa_erase(&devlink->params, param->id); kfree(param_item); } /** * devl_params_register - register configuration parameters * * @devlink: devlink * @params: configuration parameters array * @params_count: number of parameters provided * * Register the configuration parameters supported by the driver. */ int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { const struct devlink_param *param = params; int i, err; lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) { err = devlink_param_register(devlink, param); if (err) goto rollback; } return 0; rollback: if (!i) return err; for (param--; i > 0; i--, param--) devlink_param_unregister(devlink, param); return err; } EXPORT_SYMBOL_GPL(devl_params_register); int devlink_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { int err; devl_lock(devlink); err = devl_params_register(devlink, params, params_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_params_register); /** * devl_params_unregister - unregister configuration parameters * @devlink: devlink * @params: configuration parameters to unregister * @params_count: number of parameters provided */ void devl_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { const struct devlink_param *param = params; int i; lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) devlink_param_unregister(devlink, param); } EXPORT_SYMBOL_GPL(devl_params_unregister); void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { devl_lock(devlink); devl_params_unregister(devlink, params, params_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_params_unregister); /** * devl_param_driverinit_value_get - get configuration parameter * value for driver initializing * * @devlink: devlink * @param_id: parameter ID * @val: pointer to store the value of parameter in driverinit * configuration mode * * This function should be used by the driver to get driverinit * configuration for initialization after reload command. * * Note that lockless call of this function relies on the * driver to maintain following basic sane behavior: * 1) Driver ensures a call to this function cannot race with * registering/unregistering the parameter with the same parameter ID. * 2) Driver ensures a call to this function cannot race with * devl_param_driverinit_value_set() call with the same parameter ID. * 3) Driver ensures a call to this function cannot race with * reload operation. * If the driver is not able to comply, it has to take the devlink->lock * while calling this. */ int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *val) { struct devlink_param_item *param_item; if (WARN_ON(!devlink_reload_supported(devlink->ops))) return -EOPNOTSUPP; param_item = devlink_param_find_by_id(&devlink->params, param_id); if (!param_item) return -EINVAL; if (!param_item->driverinit_value_valid) return -EOPNOTSUPP; if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT))) return -EOPNOTSUPP; *val = param_item->driverinit_value; return 0; } EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); /** * devl_param_driverinit_value_set - set value of configuration * parameter for driverinit * configuration mode * * @devlink: devlink * @param_id: parameter ID * @init_val: value of parameter to set for driverinit configuration mode * * This function should be used by the driver to set driverinit * configuration mode default value. */ void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val) { struct devlink_param_item *param_item; devl_assert_locked(devlink); param_item = devlink_param_find_by_id(&devlink->params, param_id); if (WARN_ON(!param_item)) return; if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT))) return; param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); void devlink_params_driverinit_load_new(struct devlink *devlink) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(&devlink->params, param_id, param_item) { if (!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT) || !param_item->driverinit_value_new_valid) continue; param_item->driverinit_value = param_item->driverinit_value_new; param_item->driverinit_value_valid = true; param_item->driverinit_value_new_valid = false; } } /** * devl_param_value_changed - notify devlink on a parameter's value * change. Should be called by the driver * right after the change. * * @devlink: devlink * @param_id: parameter ID * * This function should be used by the driver to notify devlink on value * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function */ void devl_param_value_changed(struct devlink *devlink, u32 param_id) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_id(&devlink->params, param_id); WARN_ON(!param_item); devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devl_param_value_changed); |
| 17 29 23 22 1 16 20 15 14 13 1 9 11 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/gfp.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <net/dst.h> #include <net/flow.h> #include <net/ipv6.h> #include <net/route.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_TCPMSS.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; else return opt[offset+1]; } static u_int32_t tcpmss_reverse_mtu(struct net *net, const struct sk_buff *skb, unsigned int family) { struct flowi fl; struct rtable *rt = NULL; u_int32_t mtu = ~0U; if (family == PF_INET) { struct flowi4 *fl4 = &fl.u.ip4; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = ip_hdr(skb)->saddr; } else { struct flowi6 *fl6 = &fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } nf_route(net, (struct dst_entry **)&rt, &fl, false, family); if (rt != NULL) { mtu = dst_mtu(&rt->dst); dst_release(&rt->dst); } return mtu; } static int tcpmss_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, unsigned int family, unsigned int tcphoff, unsigned int minlen) { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; int len, tcp_hdrlen; unsigned int i; __be16 oldval; u16 newmss; u8 *opt; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return 0; if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; if (len < (int)sizeof(struct tcphdr)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr)) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = xt_net(par); unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", min_mtu); return -1; } newmss = min_mtu - minlen; } else newmss = info->mss; opt = (u_int8_t *)tcph; for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; /* Never increase MSS, even when setting it, as * doing so results in problems for hosts that rely * on MSS being set correctly. */ if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; opt[i+3] = newmss & 0x00ff; inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), false); return 0; } } /* There is data after the header so the option can't be added * without moving it, and doing so may make the SYN packet * itself too large. Accept the packet unmodified instead. */ if (len > tcp_hdrlen) return 0; /* tcph->doff has 4 bits, do not wrap it to 0 */ if (tcp_hdrlen >= 15 * 4) return 0; /* * MSS Option not found ?! add it.. */ if (skb_tailroom(skb) < TCPOLEN_MSS) { if (pskb_expand_head(skb, 0, TCPOLEN_MSS - skb_tailroom(skb), GFP_ATOMIC)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); } skb_put(skb, TCPOLEN_MSS); /* * IPv4: RFC 1122 states "If an MSS option is not received at * connection setup, TCP MUST assume a default send MSS of 536". * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum * length IPv6 header of 60, ergo the default MSS value is 1220 * Since no MSS was provided, we must use the default values */ if (xt_family(par) == NFPROTO_IPV4) newmss = min(newmss, (u16)536); else newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } static unsigned int tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; ret = tcpmss_mangle_packet(skb, par, PF_INET, iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; __be16 frag_off, oldlen, newlen; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; ret = tcpmss_mangle_packet(skb, par, PF_INET6, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); oldlen = ipv6h->payload_len; newlen = htons(ntohs(oldlen) + ret); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen), (__force __wsum)newlen); ipv6h->payload_len = newlen; } return XT_CONTINUE; } #endif /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) return true; return false; } static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #endif static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #endif }; static int __init tcpmss_tg_init(void) { return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } static void __exit tcpmss_tg_exit(void) { xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } module_init(tcpmss_tg_init); module_exit(tcpmss_tg_exit); |
| 25 25 3 14 13 4 2 1 1 6 14 15 15 3 2 1 1 1 4 2 2 2 4 23 23 23 23 1 1 1 4154 2 2 2 12 12 2 2 17 17 17 2 2 1 45 2 2 29 2 38 2 2 29 30 14 24 2 2 26 28 15 34 17 15 11 4 14 78 78 17 4142 4144 4142 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 | // SPDX-License-Identifier: GPL-2.0 /* * device_cgroup.c - device cgroup subsystem * * Copyright 2007 IBM Corp */ #include <linux/bpf-cgroup.h> #include <linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's * @css: css getting online * returns 0 in case of success, error code otherwise */ static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); if (!ret) dev_cgroup->behavior = parent_dev_cgroup->behavior; } mutex_unlock(&devcgroup_mutex); return ret; } static void devcgroup_offline(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; mutex_unlock(&devcgroup_mutex); } /* * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); if (!dev_cgroup) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev_cgroup->exceptions); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; return &dev_cgroup->css; } static void devcgroup_css_free(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } #define DEVCG_ALLOW 1 #define DEVCG_DENY 2 #define DEVCG_LIST 3 #define MAJMINLEN 13 #define ACCLEN 4 static void set_access(char *acc, short access) { int idx = 0; memset(acc, 0, ACCLEN); if (access & DEVCG_ACC_READ) acc[idx++] = 'r'; if (access & DEVCG_ACC_WRITE) acc[idx++] = 'w'; if (access & DEVCG_ACC_MKNOD) acc[idx++] = 'm'; } static char type_to_char(short type) { if (type == DEVCG_DEV_ALL) return 'a'; if (type == DEVCG_DEV_CHAR) return 'c'; if (type == DEVCG_DEV_BLOCK) return 'b'; return 'X'; } static void set_majmin(char *str, unsigned m) { if (m == ~0) strcpy(str, "*"); else sprintf(str, "%u", m); } static int devcgroup_seq_show(struct seq_file *m, void *v) { struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; rcu_read_lock(); /* * To preserve the compatibility: * - Only show the "all devices" when the default policy is to allow * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { set_access(acc, DEVCG_ACC_MASK); set_majmin(maj, ~0); set_majmin(min, ~0); seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL), maj, min, acc); } else { list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { set_access(acc, ex->access); set_majmin(maj, ex->major); set_majmin(min, ex->minor); seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), maj, min, acc); } } rcu_read_unlock(); return 0; } /** * match_exception - iterates the exception list trying to find a complete match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a complete match if an exception is found that will * contain the entire range of provided parameters. * * Return: true in case it matches an exception completely */ static bool match_exception(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; if (ex->major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && ex->minor != minor) continue; /* provided access cannot have more than the exception rule */ if (access & (~ex->access)) continue; return true; } return false; } /** * match_exception_partial - iterates the exception list trying to find a partial match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a partial match if an exception's range is found to * contain *any* of the devices specified by provided parameters. This is * used to make sure no extra access is being granted that is forbidden by * any of the exception list. * * Return: true in case the provided range mat matches an exception completely */ static bool match_exception_partial(struct list_head *exceptions, short type, u32 major, u32 minor, short access) { struct dev_exception_item *ex; list_for_each_entry_rcu(ex, exceptions, list, lockdep_is_held(&devcgroup_mutex)) { if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK)) continue; if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR)) continue; /* * We must be sure that both the exception and the provided * range aren't masking all devices */ if (ex->major != ~0 && major != ~0 && ex->major != major) continue; if (ex->minor != ~0 && minor != ~0 && ex->minor != minor) continue; /* * In order to make sure the provided range isn't matching * an exception, all its access bits shouldn't match the * exception's access bits */ if (!(access & ex->access)) continue; return true; } return false; } /** * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions * @dev_cgroup: dev cgroup to be tested against * @refex: new exception * @behavior: behavior of the exception's dev_cgroup * * This is used to make sure a child cgroup won't have more privileges * than its parent */ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, struct dev_exception_item *refex, enum devcg_behavior behavior) { bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { /* * new exception in the child doesn't matter, only * adding extra restrictions */ return true; } else { /* * new exception in the child will add more devices * that can be accessed, so it can't match any of * parent's exceptions, even slightly */ match = match_exception_partial(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) return false; return true; } } else { /* * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore * the new exception will add access to more devices and must * be contained completely in an parent's exception to be * allowed */ match = match_exception(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) /* parent has an exception that matches the proposed */ return true; else return false; } return false; } /* * parent_has_perm: * when adding a new allow rule to a device exception list, the rule * must be allowed in the parent device */ static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; return verify_new_ex(parent, ex, childcg->behavior); } /** * parent_allows_removal - verify if it's ok to remove an exception * @childcg: child cgroup from where the exception will be removed * @ex: exception being removed * * When removing an exception in cgroups with default ALLOW policy, it must * be checked if removing it will give the child cgroup more access than the * parent. * * Return: true if it's ok to remove exception, false otherwise */ static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; /* It's always allowed to remove access to devices */ if (childcg->behavior == DEVCG_DEFAULT_DENY) return true; /* * Make sure you're not removing part or a whole exception existing in * the parent cgroup */ return !match_exception_partial(&parent->exceptions, ex->type, ex->major, ex->minor, ex->access); } /** * may_allow_all - checks if it's possible to change the behavior to * allow based on parent's rules. * @parent: device cgroup's parent * returns: != 0 in case it's allowed, 0 otherwise */ static inline int may_allow_all(struct dev_cgroup *parent) { if (!parent) return 1; return parent->behavior == DEVCG_DEFAULT_ALLOW; } /** * revalidate_active_exceptions - walks through the active exception list and * revalidates the exceptions based on parent's * behavior and exceptions. The exceptions that * are no longer valid will be removed. * Called with devcgroup_mutex held. * @devcg: cgroup which exceptions will be checked * * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { struct dev_exception_item *ex; struct list_head *this, *tmp; list_for_each_safe(this, tmp, &devcg->exceptions) { ex = container_of(this, struct dev_exception_item, list); if (!parent_has_perm(devcg, ex)) dev_exception_rm(devcg, ex); } } /** * propagate_exception - propagates a new exception to the children * @devcg_root: device cgroup that added a new exception * @ex: new exception to be propagated * * returns: 0 in case of success, != 0 in case of error */ static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); css_for_each_descendant_pre(pos, &devcg_root->css) { struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become * online or offline during the tree walk (see on/offline * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); /* * in case both root's behavior and devcg is allow, a new * restriction means adding to the exception list */ if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) return rc; } else { /* * in the other possible cases: * root's behavior: allow, devcg's: deny * root's behavior: deny, devcg's: deny * the exception will be removed */ dev_exception_rm(devcg, ex); } revalidate_active_exceptions(devcg); rcu_read_lock(); } rcu_read_unlock(); return rc; } /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the exception list. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); struct dev_cgroup tmp_devcgrp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(&ex, 0, sizeof(ex)); memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); b = buffer; switch (*b) { case 'a': switch (filetype) { case DEVCG_ALLOW: if (css_has_online_children(&devcgroup->css)) return -EINVAL; if (!may_allow_all(parent)) return -EPERM; if (!parent) { devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(devcgroup); break; } INIT_LIST_HEAD(&tmp_devcgrp.exceptions); rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, &devcgroup->exceptions); if (rc) return rc; dev_exception_clean(devcgroup); rc = dev_exceptions_copy(&devcgroup->exceptions, &parent->exceptions); if (rc) { dev_exceptions_move(&devcgroup->exceptions, &tmp_devcgrp.exceptions); return rc; } devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(&tmp_devcgrp); break; case DEVCG_DENY: if (css_has_online_children(&devcgroup->css)) return -EINVAL; dev_exception_clean(devcgroup); devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; } return 0; case 'b': ex.type = DEVCG_DEV_BLOCK; break; case 'c': ex.type = DEVCG_DEV_CHAR; break; default: return -EINVAL; } b++; if (!isspace(*b)) return -EINVAL; b++; if (*b == '*') { ex.major = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.major); if (rc) return -EINVAL; } else { return -EINVAL; } if (*b != ':') return -EINVAL; b++; /* read minor */ if (*b == '*') { ex.minor = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.minor); if (rc) return -EINVAL; } else { return -EINVAL; } if (!isspace(*b)) return -EINVAL; for (b++, count = 0; count < 3; count++, b++) { switch (*b) { case 'r': ex.access |= DEVCG_ACC_READ; break; case 'w': ex.access |= DEVCG_ACC_WRITE; break; case 'm': ex.access |= DEVCG_ACC_MKNOD; break; case '\n': case '\0': count = 3; break; default: return -EINVAL; } } switch (filetype) { case DEVCG_ALLOW: /* * If the default policy is to allow by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* Check if the parent allows removing it first */ if (!parent_allows_removal(devcgroup, &ex)) return -EPERM; dev_exception_rm(devcgroup, &ex); break; } if (!parent_has_perm(devcgroup, &ex)) return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: /* * If the default policy is to deny by default, try to remove * an matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_DENY) dev_exception_rm(devcgroup, &ex); else rc = dev_exception_add(devcgroup, &ex); if (rc) break; /* we only propagate new restrictions */ rc = propagate_exception(devcgroup, &ex); break; default: rc = -EINVAL; } return rc; } static ssize_t devcgroup_access_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", .write = devcgroup_access_write, .private = DEVCG_DENY, }, { .name = "list", .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ }; struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, .legacy_cftypes = dev_cgroup_files, }; /** * devcgroup_legacy_check_permission - checks if an inode operation is permitted * @type: device type * @major: device major number * @minor: device minor number * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD * * returns 0 on success, -EPERM case the operation is not permitted */ static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) /* Can't match any of the exceptions, even partially */ rc = !match_exception_partial(&dev_cgroup->exceptions, type, major, minor, access); else /* Need to match completely one exception to be allowed */ rc = match_exception(&dev_cgroup->exceptions, type, major, minor, access); rcu_read_unlock(); if (!rc) return -EPERM; return 0; } #endif /* CONFIG_CGROUP_DEVICE */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); #else /* CONFIG_CGROUP_DEVICE */ return 0; #endif /* CONFIG_CGROUP_DEVICE */ } EXPORT_SYMBOL(devcgroup_check_permission); #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */ |
| 119 1519 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | /* SPDX-License-Identifier: GPL-2.0 */ /* * descriptor table internals; you almost certainly want file.h instead. */ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H #include <linux/posix_types.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/nospec.h> #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/atomic.h> /* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; }; /* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; unsigned int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; struct vfsmount; struct dentry; #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) #define files_fdtable(files) \ rcu_dereference_check_fdtable((files), (files)->fdt) /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); struct file *needs_masking; /* * 'mask' is zero for an out-of-bounds fd, all ones for ok. * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. * * Accessing fdt->fd[0] is ok, but needs masking of the result. */ needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return files_lookup_fd_raw(files, fd); } static inline bool close_on_exec(unsigned int fd, const struct files_struct *files) { return test_bit(fd, files_fdtable(files)->close_on_exec); } struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); struct fd_range { unsigned int from, to; }; struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); extern int close_fd(unsigned int fd); extern struct file *file_close_fd(unsigned int fd); extern struct kmem_cache *files_cachep; #endif /* __LINUX_FDTABLE_H */ |
| 11424 10877 1941 822 847 3 3 5 285 665 1 437 5 225 217 607 280 277 280 27 26 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal schedule timeout and sleeping functions */ #include <linux/delay.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include "tick-internal.h" /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns: 0 when the timer has expired otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { pr_err("%s: wrong timeout value %lx\n", __func__, timeout); dump_stack(); __set_current_state(TASK_RUNNING); goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); timer.timer.expires = expire; add_timer(&timer.timer); schedule(); timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * __set_current_state() can be used in schedule_timeout_*() functions, because * schedule_timeout() calls schedule() unconditionally. */ /** * schedule_timeout_interruptible - sleep until timeout (interruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_INTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); /** * schedule_timeout_killable - sleep until timeout (killable) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_KILLABLE before starting the timeout. */ signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); /** * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout. */ signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /** * schedule_timeout_idle - sleep until timeout (idle) * @timeout: timeout value in jiffies * * See schedule_timeout() for details. * * Task state is set to TASK_IDLE before starting the timeout. It is similar to * schedule_timeout_uninterruptible(), except this task will not contribute to * load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used * * Details are explained in schedule_hrtimeout_range() function description as * this function is commonly used. */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } hrtimer_setup_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns: 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * See schedule_hrtimeout_range() for details. @delta argument of * schedule_hrtimeout_range() is set to 0 and has therefore no impact. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout); /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Requested sleep duration in milliseconds * * msleep() uses jiffy based timeouts for the sleep duration. Because of the * design of the timer wheel, the maximum additional percentage delay (slack) is * 12.5%. This is only valid for timers which will end up in level 1 or a higher * level of the timer wheel. For explanation of those 12.5% please check the * detailed description about the basics of the timer wheel. * * The slack of timers which will end up in level 0 depends on sleep duration * (msecs) and HZ configuration and can be calculated in the following way (with * the timer wheel design restriction that the slack is not less than 12.5%): * * ``slack = MSECS_PER_TICK / msecs`` * * When the allowed slack of the callsite is known, the calculation could be * turned around to find the minimal allowed sleep duration to meet the * constraints. For example: * * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``: * all sleep durations greater or equal 4ms will meet the constraints. * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``: * all sleep durations greater or equal 8ms will meet the constraints. * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``: * all sleep durations greater or equal 16ms will meet the constraints. * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``: * all sleep durations greater or equal 32ms will meet the constraints. * * See also the signal aware variant msleep_interruptible(). */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Requested sleep duration in milliseconds * * See msleep() for some basic information. * * The difference between msleep() and msleep_interruptible() is that the sleep * could be interrupted by a signal delivery and then returns early. * * Returns: The remaining time of the sleep duration transformed to msecs (see * schedule_timeout() for details). */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range_state - Sleep for an approximate time in a given state * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * * usleep_range_state() sleeps at least for the minimum specified time but not * longer than the maximum specified amount of time. The range might reduce * power usage by allowing hrtimers to coalesce an already scheduled interrupt * with this hrtimer. In the worst case, an interrupt is scheduled for the upper * bound. * * The sleeping task is set to the specified state before starting the sleep. * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range() or its variants instead of udelay(). The sleep improves * responsiveness by avoiding the CPU-hogging busy-wait of udelay(). */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; if (WARN_ON_ONCE(max < min)) delta = 0; for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range_state); |
| 2 271 59 2 4 2 24 19 19 26 19 4 10 10 10 10 6 6 6 14 14 6 6 13 13 211 211 34 13 10 25 10 39 23 36 1 31 8 42 21 11 12 13 32 37 23 4 1 3 4 4 4 4 4 4 4 4 4 4 3 1 4 4 4 4 29 166 9 7 2 202 7 195 197 5 190 4 9 1 57 45 71 6 79 1 192 1 191 2 187 133 56 189 189 133 43 13 13 7 11 181 2 7 6 37 30 29 121 5 124 51 175 8 5 1 1 2 2 8 13 195 194 1 193 1 194 34 34 1 195 194 1 30 30 2 92 91 45 92 92 92 92 15 4 262 160 155 92 231 251 11 256 61 251 67 243 19 248 12 253 10 125 2 4 53 224 58 4 41 41 271 274 41 271 8 261 204 214 212 212 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/isofs/inode.c * * (C) 1991 Linus Torvalds - minix filesystem * 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. * 1994 Eberhard Mönkeberg - multi session handling. * 1995 Mark Dobie - allow mounting of some weird VideoCDs and PhotoCDs. * 1997 Gordon Chaffee - Joliet CDs * 1998 Eric Lammerts - ISO 9660 Level 3 * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB * 2004 Paul Serice - NFS Export Operations */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/nls.h> #include <linux/ctype.h> #include <linux/statfs.h> #include <linux/cdrom.h> #include <linux/mpage.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "isofs.h" #include "zisofs.h" /* max tz offset is 13 hours */ #define MAX_TZ_OFFSET (52*15*60) #define BEQUIET static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif static void isofs_put_super(struct super_block *sb) { struct isofs_sb_info *sbi = ISOFS_SB(sb); #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif kfree(sbi); sb->s_fs_info = NULL; return; } static int isofs_read_inode(struct inode *, int relocated); static int isofs_statfs (struct dentry *, struct kstatfs *); static int isofs_show_options(struct seq_file *, struct dentry *); static struct kmem_cache *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void isofs_free_inode(struct inode *inode) { kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } static void init_once(void *foo) { struct iso_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!isofs_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(isofs_inode_cachep); } static int isofs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); if (!(fc->sb_flags & SB_RDONLY)) return -EROFS; return 0; } static const struct super_operations isofs_sops = { .alloc_inode = isofs_alloc_inode, .free_inode = isofs_free_inode, .put_super = isofs_put_super, .statfs = isofs_statfs, .show_options = isofs_show_options, }; static const struct dentry_operations isofs_dentry_ops[] = { { .d_hash = isofs_hashi, .d_compare = isofs_dentry_cmpi, }, #ifdef CONFIG_JOLIET { .d_hash = isofs_hash_ms, .d_compare = isofs_dentry_cmp_ms, }, { .d_hash = isofs_hashi_ms, .d_compare = isofs_dentry_cmpi_ms, }, #endif }; struct isofs_options{ unsigned int rock:1; unsigned int joliet:1; unsigned int cruft:1; unsigned int hide:1; unsigned int showassoc:1; unsigned int nocompress:1; unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; unsigned char map; unsigned char check; unsigned int blocksize; umode_t fmode; umode_t dmode; kgid_t gid; kuid_t uid; char *iocharset; /* LVE */ s32 session; s32 sbsector; }; /* * Compute the hash for the isofs name corresponding to the dentry. */ static int isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; char c; unsigned long hash; len = qstr->len; name = qstr->name; if (ms) { while (len && name[len-1] == '.') len--; } hash = init_name_hash(dentry); while (len--) { c = tolower(*name++); hash = partial_name_hash(c, hash); } qstr->hash = end_name_hash(hash); return 0; } /* * Compare of two isofs names. */ static int isofs_dentry_cmp_common( unsigned int len, const char *str, const struct qstr *name, int ms, int ci) { int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ alen = name->len; blen = len; if (ms) { while (alen && name->name[alen-1] == '.') alen--; while (blen && str[blen-1] == '.') blen--; } if (alen == blen) { if (ci) { if (strncasecmp(name->name, str, alen) == 0) return 0; } else { if (strncmp(name->name, str, alen) == 0) return 0; } } return 1; } static int isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 1); } #ifdef CONFIG_JOLIET /* * Compute the hash for the isofs name corresponding to the dentry. */ static int isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; len = qstr->len; name = qstr->name; if (ms) { while (len && name[len-1] == '.') len--; } qstr->hash = full_name_hash(dentry, name, len); return 0; } static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 1); } #endif enum { Opt_block, Opt_check, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset, Opt_map, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, }; static const struct constant_table isofs_param_map[] = { {"acorn", 'a'}, {"a", 'a'}, {"normal", 'n'}, {"n", 'n'}, {"off", 'o'}, {"o", 'o'}, {} }; static const struct constant_table isofs_param_check[] = { {"relaxed", 'r'}, {"r", 'r'}, {"strict", 's'}, {"s", 's'}, {} }; static const struct fs_parameter_spec isofs_param_spec[] = { fsparam_flag ("norock", Opt_norock), fsparam_flag ("nojoliet", Opt_nojoliet), fsparam_flag ("unhide", Opt_unhide), fsparam_flag ("hide", Opt_hide), fsparam_flag ("showassoc", Opt_showassoc), fsparam_flag ("cruft", Opt_cruft), fsparam_flag ("utf8", Opt_utf8), fsparam_string ("iocharset", Opt_iocharset), fsparam_enum ("map", Opt_map, isofs_param_map), fsparam_u32 ("session", Opt_session), fsparam_u32 ("sbsector", Opt_sb), fsparam_enum ("check", Opt_check, isofs_param_check), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), /* Note: mode/dmode historically accepted %u not strictly %o */ fsparam_u32 ("mode", Opt_mode), fsparam_u32 ("dmode", Opt_dmode), fsparam_flag ("overriderockperm", Opt_overriderockperm), fsparam_u32 ("block", Opt_block), fsparam_string ("conv", Opt_ignore), fsparam_flag ("nocompress", Opt_nocompress), {} }; static int isofs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct isofs_options *popt = fc->fs_private; struct fs_parse_result result; int opt; unsigned int n; /* There are no remountable options */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) return 0; opt = fs_parse(fc, isofs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_norock: popt->rock = 0; break; case Opt_nojoliet: popt->joliet = 0; break; case Opt_hide: popt->hide = 1; break; case Opt_unhide: case Opt_showassoc: popt->showassoc = 1; break; case Opt_cruft: popt->cruft = 1; break; #ifdef CONFIG_JOLIET case Opt_utf8: kfree(popt->iocharset); popt->iocharset = kstrdup("utf8", GFP_KERNEL); if (!popt->iocharset) return -ENOMEM; break; case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = kstrdup(param->string, GFP_KERNEL); if (!popt->iocharset) return -ENOMEM; break; #endif case Opt_map: popt->map = result.uint_32; break; case Opt_session: n = result.uint_32; /* * Track numbers are supposed to be in range 1-99, the * mount option starts indexing at 0. */ if (n >= 99) return -EINVAL; popt->session = n + 1; break; case Opt_sb: popt->sbsector = result.uint_32; break; case Opt_check: popt->check = result.uint_32; break; case Opt_ignore: break; case Opt_uid: popt->uid = result.uid; popt->uid_set = 1; break; case Opt_gid: popt->gid = result.gid; popt->gid_set = 1; break; case Opt_mode: popt->fmode = result.uint_32; break; case Opt_dmode: popt->dmode = result.uint_32; break; case Opt_overriderockperm: popt->overriderockperm = 1; break; case Opt_block: n = result.uint_32; if (n != 512 && n != 1024 && n != 2048) return -EINVAL; popt->blocksize = n; break; case Opt_nocompress: popt->nocompress = 1; break; default: return -EINVAL; } return 0; } /* * Display the mount options in /proc/mounts. */ static int isofs_show_options(struct seq_file *m, struct dentry *root) { struct isofs_sb_info *sbi = ISOFS_SB(root->d_sb); if (!sbi->s_rock) seq_puts(m, ",norock"); else if (!sbi->s_joliet_level) seq_puts(m, ",nojoliet"); if (sbi->s_cruft) seq_puts(m, ",cruft"); if (sbi->s_hide) seq_puts(m, ",hide"); if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); if (sbi->s_session != 255) seq_printf(m, ",session=%u", sbi->s_session - 1); if (sbi->s_sbsector != -1) seq_printf(m, ",sbsector=%u", sbi->s_sbsector); if (root->d_sb->s_blocksize != 1024) seq_printf(m, ",blocksize=%lu", root->d_sb->s_blocksize); if (sbi->s_uid_set) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid)); if (sbi->s_gid_set) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_dmode != ISOFS_INVALID_MODE) seq_printf(m, ",dmode=%o", sbi->s_dmode); if (sbi->s_fmode != ISOFS_INVALID_MODE) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); else seq_puts(m, ",iocharset=utf8"); #endif return 0; } /* * look if the driver can tell the multi session redirection value * * don't change this if you don't know what you do, please! * Multisession is legal only with XA disks. * A non-XA disk with more than one volume descriptor may do it right, but * usually is written in a nowhere standardized "multi-partition" manner. * Multisession uses absolute addressing (solely the first frame of the whole * track is #0), multi-partition uses relative addressing (each first frame of * each track is #0), and a track is not a session. * * A broken CDwriter software or drive firmware does not set new standards, * at least not if conflicting with the existing ones. * * emoenke@gwdg.de */ #define WE_OBEY_THE_WRITTEN_STANDARDS 1 static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned int vol_desc_start = 0; if (session > 0) { struct cdrom_tocentry te; if (!cdi) return 0; te.cdte_track = session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) == 0) { printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", session, te.cdte_addr.lba, te.cdte_ctrl & CDROM_DATA_TRACK); if ((te.cdte_ctrl & CDROM_DATA_TRACK) == 4) return te.cdte_addr.lba; } printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); } if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0) { #if WE_OBEY_THE_WRITTEN_STANDARDS /* necessary for a valid ms_info.addr */ if (ms_info.xa_flag) #endif vol_desc_start = ms_info.addr.lba; } } return vol_desc_start; } /* * Check if root directory is empty (has less than 3 files). * * Used to detect broken CDs where ISO root directory is empty but Joliet root * directory is OK. If such CD has Rock Ridge extensions, they will be disabled * (and Joliet used instead) or else no files would be visible. */ static bool rootdir_empty(struct super_block *sb, unsigned long block) { int offset = 0, files = 0, de_len; struct iso_directory_record *de; struct buffer_head *bh; bh = sb_bread(sb, block); if (!bh) return true; while (files < 3) { de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len == 0) break; files++; offset += de_len; } brelse(bh); return files < 3; } /* * Initialize the superblock and read the root inode. */ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh = NULL, *pri_bh = NULL; struct hs_primary_descriptor *h_pri = NULL; struct iso_primary_descriptor *pri = NULL; struct iso_supplementary_descriptor *sec = NULL; struct iso_directory_record *rootp; struct inode *inode; struct isofs_options *opt = fc->fs_private; struct isofs_sb_info *sbi; unsigned long first_data_zone; int joliet_level = 0; int iso_blknum, block; int orig_zonesize; int table, error = -EINVAL; unsigned int vol_desc_start; int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; /* * First of all, get the hardware blocksize for this device. * If we don't know what it is, or the hardware blocksize is * larger than the blocksize the user specified, then use * that value. */ /* * What if bugger tells us to go beyond page size? */ if (bdev_logical_block_size(s->s_bdev) > 2048) { printk(KERN_WARNING "ISOFS: unsupported/invalid hardware sector size %d\n", bdev_logical_block_size(s->s_bdev)); goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); if (!opt->blocksize) { printk(KERN_ERR "ISOFS: unable to set blocksize\n"); goto out_freesbi; } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; sbi->s_sbsector = opt->sbsector; vol_desc_start = (opt->sbsector != -1) ? opt->sbsector : isofs_get_last_session(s, opt->session); for (iso_blknum = vol_desc_start+16; iso_blknum < vol_desc_start+100; iso_blknum++) { struct hs_volume_descriptor *hdp; struct iso_volume_descriptor *vdp; block = iso_blknum << (ISOFS_BLOCK_BITS - s->s_blocksize_bits); if (!(bh = sb_bread(s, block))) goto out_no_read; vdp = (struct iso_volume_descriptor *)bh->b_data; hdp = (struct hs_volume_descriptor *)bh->b_data; /* * Due to the overlapping physical location of the descriptors, * ISO CDs can match hdp->id==HS_STANDARD_ID as well. To ensure * proper identification in this case, we first check for ISO. */ if (strncmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) == 0) { if (isonum_711(vdp->type) == ISO_VD_END) break; if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { if (!pri) { pri = (struct iso_primary_descriptor *)vdp; /* Save the buffer in case we need it ... */ pri_bh = bh; bh = NULL; } } #ifdef CONFIG_JOLIET else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { sec = (struct iso_supplementary_descriptor *)vdp; if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { if (opt->joliet) { if (sec->escape[2] == 0x40) joliet_level = 1; else if (sec->escape[2] == 0x43) joliet_level = 2; else if (sec->escape[2] == 0x45) joliet_level = 3; printk(KERN_DEBUG "ISO 9660 Extensions: " "Microsoft Joliet Level %d\n", joliet_level); } goto root_found; } else { /* Unknown supplementary volume descriptor */ sec = NULL; } } #endif } else { if (strncmp (hdp->id, HS_STANDARD_ID, sizeof hdp->id) == 0) { if (isonum_711(hdp->type) != ISO_VD_PRIMARY) goto out_freebh; sbi->s_high_sierra = 1; opt->rock = 0; h_pri = (struct hs_primary_descriptor *)vdp; goto root_found; } } /* Just skip any volume descriptors we don't recognize */ brelse(bh); bh = NULL; } /* * If we fall through, either no volume descriptor was found, * or else we passed a primary descriptor looking for others. */ if (!pri) goto out_unknown_format; brelse(bh); bh = pri_bh; pri_bh = NULL; root_found: /* We don't support read-write mounts */ if (!sb_rdonly(s)) { error = -EACCES; goto out_freebh; } if (joliet_level && (!pri || !opt->rock)) { /* This is the case of Joliet with the norock mount flag. * A disc with both Joliet and Rock Ridge is handled later */ pri = (struct iso_primary_descriptor *) sec; } if(sbi->s_high_sierra){ rootp = (struct iso_directory_record *) h_pri->root_directory_record; sbi->s_nzones = isonum_733(h_pri->volume_space_size); sbi->s_log_zone_size = isonum_723(h_pri->logical_block_size); sbi->s_max_size = isonum_733(h_pri->volume_space_size); } else { if (!pri) goto out_freebh; rootp = (struct iso_directory_record *) pri->root_directory_record; sbi->s_nzones = isonum_733(pri->volume_space_size); sbi->s_log_zone_size = isonum_723(pri->logical_block_size); sbi->s_max_size = isonum_733(pri->volume_space_size); } sbi->s_ninodes = 0; /* No way to figure this out easily */ orig_zonesize = sbi->s_log_zone_size; /* * If the zone size is smaller than the hardware sector size, * this is a fatal error. This would occur if the disc drive * had sectors that were 2048 bytes, but the filesystem had * blocks that were 512 bytes (which should only very rarely * happen.) */ if (orig_zonesize < opt->blocksize) goto out_bad_size; /* RDE: convert log zone size to bit shift */ switch (sbi->s_log_zone_size) { case 512: sbi->s_log_zone_size = 9; break; case 1024: sbi->s_log_zone_size = 10; break; case 2048: sbi->s_log_zone_size = 11; break; default: goto out_bad_zone_size; } s->s_magic = ISOFS_SUPER_MAGIC; /* * With multi-extent files, file size is only limited by the maximum * size of a file system, which is 8 TB. */ s->s_maxbytes = 0x80000000000LL; /* ECMA-119 timestamp from 1900/1/1 with tz offset */ s->s_time_min = mktime64(1900, 1, 1, 0, 0, 0) - MAX_TZ_OFFSET; s->s_time_max = mktime64(U8_MAX+1900, 12, 31, 23, 59, 59) + MAX_TZ_OFFSET; /* Set this for reference. Its not currently used except on write which we don't have .. */ first_data_zone = isonum_733(rootp->extent) + isonum_711(rootp->ext_attr_length); sbi->s_firstdatazone = first_data_zone; #ifndef BEQUIET printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", sbi->s_max_size, 1UL << sbi->s_log_zone_size); printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); if(sbi->s_high_sierra) printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); #endif /* * If the Joliet level is set, we _may_ decide to use the * secondary descriptor, but can't be sure until after we * read the root inode. But before reading the root inode * we may need to change the device blocksize, and would * rather release the old buffer first. So, we cache the * first_data_zone value from the secondary descriptor. */ if (joliet_level) { pri = (struct iso_primary_descriptor *) sec; rootp = (struct iso_directory_record *) pri->root_directory_record; first_data_zone = isonum_733(rootp->extent) + isonum_711(rootp->ext_attr_length); } /* * We're all done using the volume descriptor, and may need * to change the device blocksize, so release the buffer now. */ brelse(pri_bh); brelse(bh); /* * Force the blocksize to 512 for 512 byte sectors. The file * read primitives really get it wrong in a bad way if we don't * do this. * * Note - we should never be setting the blocksize to something * less than the hardware sector size for the device. If we * do, we would end up having to read larger buffers and split * out portions to satisfy requests. * * Note2- the idea here is that we want to deal with the optimal * zonesize in the filesystem. If we have it set to something less, * then we have horrible problems with trying to piece together * bits of adjacent blocks in order to properly read directory * entries. By forcing the blocksize in this way, we ensure * that we will never be required to do this. */ sb_set_blocksize(s, orig_zonesize); sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET if (joliet_level) { char *p = opt->iocharset ? opt->iocharset : CONFIG_NLS_DEFAULT; if (strcmp(p, "utf8") != 0) { sbi->s_nls_iocharset = opt->iocharset ? load_nls(opt->iocharset) : load_nls_default(); if (!sbi->s_nls_iocharset) goto out_freesbi; } } #endif s->s_op = &isofs_sops; s->s_export_op = &isofs_export_ops; sbi->s_mapping = opt->map; sbi->s_rock = (opt->rock ? 2 : 0); sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ sbi->s_cruft = opt->cruft; sbi->s_hide = opt->hide; sbi->s_showassoc = opt->showassoc; sbi->s_uid = opt->uid; sbi->s_gid = opt->gid; sbi->s_uid_set = opt->uid_set; sbi->s_gid_set = opt->gid_set; sbi->s_nocompress = opt->nocompress; sbi->s_overriderockperm = opt->overriderockperm; /* * It would be incredibly stupid to allow people to mark every file * on the disk as suid, so we merely allow them to set the default * permissions. */ if (opt->fmode != ISOFS_INVALID_MODE) sbi->s_fmode = opt->fmode & 0777; else sbi->s_fmode = ISOFS_INVALID_MODE; if (opt->dmode != ISOFS_INVALID_MODE) sbi->s_dmode = opt->dmode & 0777; else sbi->s_dmode = ISOFS_INVALID_MODE; /* * Read the root inode, which _may_ result in changing * the s_rock flag. Once we have the final s_rock value, * we then decide whether to use the Joliet descriptor. */ inode = isofs_iget(s, sbi->s_firstdatazone, 0); /* * Fix for broken CDs with a corrupt root inode but a correct Joliet * root directory. */ if (IS_ERR(inode)) { if (joliet_level && sbi->s_firstdatazone != first_data_zone) { printk(KERN_NOTICE "ISOFS: root inode is unusable. " "Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; inode = NULL; } else { goto out_no_root; } } /* * Fix for broken CDs with Rock Ridge and empty ISO root directory but * correct Joliet root directory. */ if (sbi->s_rock == 1 && joliet_level && rootdir_empty(s, sbi->s_firstdatazone)) { printk(KERN_NOTICE "ISOFS: primary root directory is empty. " "Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; } /* * If this disk has both Rock Ridge and Joliet on it, then we * want to use Rock Ridge by default. This can be overridden * by using the norock mount option. There is still one other * possibility that is not taken into account: a Rock Ridge * CD with Unicode names. Until someone sees such a beast, it * will not be supported. */ if (sbi->s_rock == 1) { joliet_level = 0; } else if (joliet_level) { sbi->s_rock = 0; if (sbi->s_firstdatazone != first_data_zone) { sbi->s_firstdatazone = first_data_zone; printk(KERN_DEBUG "ISOFS: changing to secondary root\n"); iput(inode); inode = isofs_iget(s, sbi->s_firstdatazone, 0); if (IS_ERR(inode)) goto out_no_root; } } if (opt->check == 'u') { /* Only Joliet is case insensitive by default */ if (joliet_level) opt->check = 'r'; else opt->check = 's'; } sbi->s_joliet_level = joliet_level; /* Make sure the root inode is a directory */ if (!S_ISDIR(inode->i_mode)) { printk(KERN_WARNING "isofs_fill_super: root inode is not a directory. " "Corrupted media?\n"); goto out_iput; } table = 0; if (joliet_level) table += 2; if (opt->check == 'r') table++; sbi->s_check = opt->check; if (table) set_default_d_op(s, &isofs_dentry_ops[table - 1]); /* get the root dentry */ s->s_root = d_make_root(inode); if (!(s->s_root)) { error = -ENOMEM; goto out_no_inode; } return 0; /* * Display error messages and free resources. */ out_iput: iput(inode); goto out_no_inode; out_no_root: error = PTR_ERR(inode); if (error != -ENOMEM) printk(KERN_WARNING "%s: get root inode failed\n", __func__); out_no_inode: #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif goto out_freesbi; out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); goto out_freebh; out_bad_size: printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", orig_zonesize, opt->blocksize); goto out_freebh; out_unknown_format: if (!silent) printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); out_freebh: brelse(bh); brelse(pri_bh); out_freesbi: kfree(sbi); s->s_fs_info = NULL; return error; } static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (ISOFS_SB(sb)->s_nzones << (ISOFS_SB(sb)->s_log_zone_size - sb->s_blocksize_bits)); buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = NAME_MAX; return 0; } /* * Get a set of blocks; filling in buffer_heads if already allocated * or getblk() if they are not. Returns the number of blocks inserted * (-ve == error.) */ int isofs_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head **bh, unsigned long nblocks) { unsigned long b_off = iblock; unsigned offset, sect_size; unsigned int firstext; unsigned long nextblk, nextoff; int section, rv, error; struct iso_inode_info *ei = ISOFS_I(inode); error = -EIO; rv = 0; if (iblock != b_off) { printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } offset = 0; firstext = ei->i_first_extent; sect_size = ei->i_section_size >> ISOFS_BUFFER_BITS(inode); nextblk = ei->i_next_section_block; nextoff = ei->i_next_section_offset; section = 0; while (nblocks) { /* If we are *way* beyond the end of the file, print a message. * Access beyond the end of the file up to the next page boundary * is normal, however because of the way the page cache works. * In this case, we just return 0 so that we can properly fill * the page with useless information without generating any * I/O errors. */ if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", __func__, b_off, (unsigned long long)inode->i_size); goto abort; } /* On the last section, nextblk == 0, section size is likely to * exceed sect_size by a partial block, and access beyond the * end of the file will reach beyond the section size, too. */ while (nextblk && (b_off >= (offset + sect_size))) { struct inode *ninode; offset += sect_size; ninode = isofs_iget(inode->i_sb, nextblk, nextoff); if (IS_ERR(ninode)) { error = PTR_ERR(ninode); goto abort; } firstext = ISOFS_I(ninode)->i_first_extent; sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); nextblk = ISOFS_I(ninode)->i_next_section_block; nextoff = ISOFS_I(ninode)->i_next_section_offset; iput(ninode); if (++section > 100) { printk(KERN_DEBUG "%s: More than 100 file sections ?!?" " aborting...\n", __func__); printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " "nextblk=%lu nextoff=%lu\n", __func__, b_off, firstext, (unsigned) sect_size, nextblk, nextoff); goto abort; } } if (*bh) { map_bh(*bh, inode->i_sb, firstext + b_off - offset); } else { *bh = sb_getblk(inode->i_sb, firstext+b_off-offset); if (!*bh) goto abort; } bh++; /* Next buffer head */ b_off++; /* Next buffer offset */ nblocks--; rv++; } error = 0; abort: return rv != 0 ? rv : error; } /* * Used by the standard interfaces. */ static int isofs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; if (create) { printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); return -EROFS; } ret = isofs_get_blocks(inode, iblock, &bh_result, 1); return ret < 0 ? ret : 0; } static int isofs_bmap(struct inode *inode, sector_t block) { struct buffer_head dummy; int error; dummy.b_state = 0; dummy.b_blocknr = -1000; error = isofs_get_block(inode, block, &dummy, 0); if (!error) return dummy.b_blocknr; return 0; } struct buffer_head *isofs_bread(struct inode *inode, sector_t block) { sector_t blknr = isofs_bmap(inode, block); if (!blknr) return NULL; return sb_bread(inode->i_sb, blknr); } static int isofs_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, isofs_get_block); } static void isofs_readahead(struct readahead_control *rac) { mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,isofs_get_block); } static const struct address_space_operations isofs_aops = { .read_folio = isofs_read_folio, .readahead = isofs_readahead, .bmap = _isofs_bmap }; static int isofs_read_level3_size(struct inode *inode) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; struct buffer_head *bh = NULL; unsigned long block, offset, block_saved, offset_saved; int i = 0; int more_entries = 0; struct iso_directory_record *tmpde = NULL; struct iso_inode_info *ei = ISOFS_I(inode); inode->i_size = 0; /* The first 16 blocks are reserved as the System Area. Thus, * no inodes can appear in block 0. We use this to flag that * this is the last section. */ ei->i_next_section_block = 0; ei->i_next_section_offset = 0; block = ei->i_iget5_block; offset = ei->i_iget5_offset; do { struct iso_directory_record *de; unsigned int de_len; if (!bh) { bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; } de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len == 0) { brelse(bh); bh = NULL; ++block; offset = 0; continue; } block_saved = block; offset_saved = offset; offset += de_len; /* Make sure we have a full directory entry */ if (offset >= bufsize) { int slop = bufsize - offset + de_len; if (!tmpde) { tmpde = kmalloc(256, GFP_KERNEL); if (!tmpde) goto out_nomem; } memcpy(tmpde, de, slop); offset &= bufsize - 1; block++; brelse(bh); bh = NULL; if (offset) { bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; memcpy((void *)tmpde+slop, bh->b_data, offset); } de = tmpde; } inode->i_size += isonum_733(de->size); if (i == 1) { ei->i_next_section_block = block_saved; ei->i_next_section_offset = offset_saved; } more_entries = de->flags[-high_sierra] & 0x80; i++; if (i > 100) goto out_toomany; } while (more_entries); out: kfree(tmpde); brelse(bh); return 0; out_nomem: brelse(bh); return -ENOMEM; out_noread: printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); kfree(tmpde); return -EIO; out_toomany: printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" "isofs_read_level3_size: inode=%lu\n", __func__, inode->i_ino); goto out; } static int isofs_read_inode(struct inode *inode, int relocated) { struct super_block *sb = inode->i_sb; struct isofs_sb_info *sbi = ISOFS_SB(sb); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned long block; int high_sierra = sbi->s_high_sierra; struct buffer_head *bh; struct iso_directory_record *de; struct iso_directory_record *tmpde = NULL; unsigned int de_len; unsigned long offset; struct iso_inode_info *ei = ISOFS_I(inode); int ret = -EIO; struct timespec64 ts; block = ei->i_iget5_block; bh = sb_bread(inode->i_sb, block); if (!bh) goto out_badread; offset = ei->i_iget5_offset; de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; if (de_len < sizeof(struct iso_directory_record)) goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; tmpde = kmalloc(de_len, GFP_KERNEL); if (!tmpde) { ret = -ENOMEM; goto fail; } memcpy(tmpde, bh->b_data + offset, frag1); brelse(bh); bh = sb_bread(inode->i_sb, ++block); if (!bh) goto out_badread; memcpy((char *)tmpde+frag1, bh->b_data, de_len - frag1); de = tmpde; } inode->i_ino = isofs_get_ino(ei->i_iget5_block, ei->i_iget5_offset, ISOFS_BUFFER_BITS(inode)); /* Assume it is a normal-format file unless told otherwise */ ei->i_file_format = isofs_file_normal; if (de->flags[-high_sierra] & 2) { if (sbi->s_dmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFDIR | sbi->s_dmode; else inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; set_nlink(inode, 1); /* * Set to 1. We know there are 2, but * the find utility tries to optimize * if it is 2, and it screws up. It is * easier to give 1 which tells find to * do it the hard way. */ } else { if (sbi->s_fmode != ISOFS_INVALID_MODE) { inode->i_mode = S_IFREG | sbi->s_fmode; } else { /* * Set default permissions: r-x for all. The disc * could be shared with DOS machines so virtually * anything could be a valid executable. */ inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; } set_nlink(inode, 1); } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; inode->i_blocks = 0; ei->i_format_parm[0] = 0; ei->i_format_parm[1] = 0; ei->i_format_parm[2] = 0; ei->i_section_size = isonum_733(de->size); if (de->flags[-high_sierra] & 0x80) { ret = isofs_read_level3_size(inode); if (ret < 0) goto fail; ret = -EIO; } else { ei->i_next_section_block = 0; ei->i_next_section_offset = 0; inode->i_size = isonum_733(de->size); } /* * Some dipshit decided to store some other bit of information * in the high byte of the file length. Truncate size in case * this CDROM was mounted with the cruft option. */ if (sbi->s_cruft) inode->i_size &= 0x00ffffff; if (de->interleave[0]) { printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); inode->i_size = 0; } /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", inode->i_ino); } /* I have no idea what other flag bits are used for, so we will flag it for now */ #ifdef DEBUG if((de->flags[-high_sierra] & ~2)!= 0){ printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " "(%ld %x).\n", inode->i_ino, de->flags[-high_sierra]); } #endif ts = iso_date(de->date, high_sierra ? ISO_DATE_HIGH_SIERRA : 0); inode_set_ctime_to_ts(inode, ts); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); /* Set the number of blocks for stat() - should be done before RR */ inode->i_blocks = (inode->i_size + 511) >> 9; /* * Now test for possible Rock Ridge extensions which will override * some of these numbers in the inode structure. */ if (!high_sierra) { parse_rock_ridge_inode(de, inode, relocated); /* if we want uid/gid set, override the rock ridge setting */ if (sbi->s_uid_set) inode->i_uid = sbi->s_uid; if (sbi->s_gid_set) inode->i_gid = sbi->s_gid; } /* Now set final access rights if overriding rock ridge setting */ if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && sbi->s_dmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFDIR | sbi->s_dmode; if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && sbi->s_fmode != ISOFS_INVALID_MODE) inode->i_mode = S_IFREG | sbi->s_fmode; /* Install the inode operations vector */ if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; switch (ei->i_file_format) { #ifdef CONFIG_ZISOFS case isofs_file_compressed: inode->i_data.a_ops = &zisofs_aops; break; #endif default: inode->i_data.a_ops = &isofs_aops; break; } } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &isofs_dir_inode_operations; inode->i_fop = &isofs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n", inode->i_mode, inode->i_ino); ret = -EIO; goto fail; } ret = 0; out: kfree(tmpde); brelse(bh); return ret; out_badread: printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); fail: goto out; } struct isofs_iget5_callback_data { unsigned long block; unsigned long offset; }; static int isofs_iget5_test(struct inode *ino, void *data) { struct iso_inode_info *i = ISOFS_I(ino); struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; return (i->i_iget5_block == d->block) && (i->i_iget5_offset == d->offset); } static int isofs_iget5_set(struct inode *ino, void *data) { struct iso_inode_info *i = ISOFS_I(ino); struct isofs_iget5_callback_data *d = (struct isofs_iget5_callback_data*)data; i->i_iget5_block = d->block; i->i_iget5_offset = d->offset; return 0; } /* Store, in the inode's containing structure, the block and block * offset that point to the underlying meta-data for the inode. The * code below is otherwise similar to the iget() code in * include/linux/fs.h */ struct inode *__isofs_iget(struct super_block *sb, unsigned long block, unsigned long offset, int relocated) { unsigned long hashval; struct inode *inode; struct isofs_iget5_callback_data data; long ret; if (offset >= 1ul << sb->s_blocksize_bits) return ERR_PTR(-EINVAL); data.block = block; data.offset = offset; hashval = (block << sb->s_blocksize_bits) | offset; inode = iget5_locked(sb, hashval, &isofs_iget5_test, &isofs_iget5_set, &data); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); inode = ERR_PTR(ret); } else { unlock_new_inode(inode); } } return inode; } static int isofs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, isofs_fill_super); } static void isofs_free_fc(struct fs_context *fc) { struct isofs_options *opt = fc->fs_private; kfree(opt->iocharset); kfree(opt); } static const struct fs_context_operations isofs_context_ops = { .parse_param = isofs_parse_param, .get_tree = isofs_get_tree, .reconfigure = isofs_reconfigure, .free = isofs_free_fc, }; static int isofs_init_fs_context(struct fs_context *fc) { struct isofs_options *opt; opt = kzalloc(sizeof(*opt), GFP_KERNEL); if (!opt) return -ENOMEM; opt->map = 'n'; opt->rock = 1; opt->joliet = 1; opt->cruft = 0; opt->hide = 0; opt->showassoc = 0; opt->check = 'u'; /* unset */ opt->nocompress = 0; opt->blocksize = 1024; opt->fmode = opt->dmode = ISOFS_INVALID_MODE; opt->uid_set = 0; opt->gid_set = 0; opt->gid = GLOBAL_ROOT_GID; opt->uid = GLOBAL_ROOT_UID; opt->iocharset = NULL; opt->overriderockperm = 0; opt->session = -1; opt->sbsector = -1; fc->fs_private = opt; fc->ops = &isofs_context_ops; return 0; } static struct file_system_type iso9660_fs_type = { .owner = THIS_MODULE, .name = "iso9660", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = isofs_init_fs_context, .parameters = isofs_param_spec, }; MODULE_ALIAS_FS("iso9660"); MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { int err = init_inodecache(); if (err) goto out; #ifdef CONFIG_ZISOFS err = zisofs_init(); if (err) goto out1; #endif err = register_filesystem(&iso9660_fs_type); if (err) goto out2; return 0; out2: #ifdef CONFIG_ZISOFS zisofs_cleanup(); out1: #endif destroy_inodecache(); out: return err; } static void __exit exit_iso9660_fs(void) { unregister_filesystem(&iso9660_fs_type); #ifdef CONFIG_ZISOFS zisofs_cleanup(); #endif destroy_inodecache(); } module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_DESCRIPTION("ISO 9660 CDROM file system support"); MODULE_LICENSE("GPL"); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FPU_XCR_H #define _ASM_X86_FPU_XCR_H #define XCR_XFEATURE_ENABLED_MASK 0x00000000 #define XCR_XFEATURE_IN_USE_MASK 0x00000001 static __always_inline u64 xgetbv(u32 index) { u32 eax, edx; asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); return eax + ((u64)edx << 32); } static inline void xsetbv(u32 index, u64 value) { u32 eax = value; u32 edx = value >> 32; asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } /* * Return a mask of xfeatures which are currently being tracked * by the processor as being in the initial configuration. * * Callers should check X86_FEATURE_XGETBV1. */ static __always_inline u64 xfeatures_in_use(void) { return xgetbv(XCR_XFEATURE_IN_USE_MASK); } #endif /* _ASM_X86_FPU_XCR_H */ |
| 9 2 1 2 3 1 4 7 2 3 2 1 1 3 2 1 1 2 16 8 7 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 Laura Garcia <nevola@gmail.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <linux/jhash.h> struct nft_jhash { u8 sreg; u8 dreg; u8 len; bool autogen_seed:1; u32 modulus; u32 seed; u32 offset; }; static void nft_jhash_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_jhash *priv = nft_expr_priv(expr); const void *data = ®s->data[priv->sreg]; u32 h; h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus); regs->data[priv->dreg] = h + priv->offset; } struct nft_symhash { u8 dreg; u32 modulus; u32 offset; }; static void nft_symhash_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_symhash *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; u32 h; h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb), priv->modulus); regs->data[priv->dreg] = h + priv->offset; } static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, [NFTA_HASH_TYPE] = { .type = NLA_U32 }, }; static int nft_jhash_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_jhash *priv = nft_expr_priv(expr); u32 len; int err; if (!tb[NFTA_HASH_SREG] || !tb[NFTA_HASH_DREG] || !tb[NFTA_HASH_LEN] || !tb[NFTA_HASH_MODULUS]) return -EINVAL; if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); if (err < 0) return err; if (len == 0) return -ERANGE; priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len); if (err < 0) return err; priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; if (tb[NFTA_HASH_SEED]) { priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); } else { priv->autogen_seed = true; get_random_bytes(&priv->seed, sizeof(priv->seed)); } return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_symhash_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_symhash *priv = nft_expr_priv(expr); if (!tb[NFTA_HASH_DREG] || !tb[NFTA_HASH_MODULUS]) return -EINVAL; if (tb[NFTA_HASH_OFFSET]) priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); if (priv->modulus < 1) return -ERANGE; if (priv->offset + priv->modulus - 1 < priv->offset) return -EOVERFLOW; return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, sizeof(u32)); } static int nft_jhash_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_jhash *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg)) goto nla_put_failure; if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_LEN, htonl(priv->len))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) goto nla_put_failure; if (!priv->autogen_seed && nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed))) goto nla_put_failure; if (priv->offset != 0) if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_TYPE, htonl(NFT_HASH_JENKINS))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_jhash_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_jhash *priv = nft_expr_priv(expr); nft_reg_track_cancel(track, priv->dreg, sizeof(u32)); return false; } static int nft_symhash_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_symhash *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) goto nla_put_failure; if (priv->offset != 0) if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HASH_TYPE, htonl(NFT_HASH_SYM))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_symhash_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { struct nft_symhash *priv = nft_expr_priv(expr); struct nft_symhash *symhash; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, sizeof(u32)); return false; } symhash = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->offset != symhash->offset || priv->modulus != symhash->modulus) { nft_reg_track_update(track, expr, priv->dreg, sizeof(u32)); return false; } if (!track->regs[priv->dreg].bitwise) return true; return false; } static struct nft_expr_type nft_hash_type; static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)), .eval = nft_jhash_eval, .init = nft_jhash_init, .dump = nft_jhash_dump, .reduce = nft_jhash_reduce, }; static const struct nft_expr_ops nft_symhash_ops = { .type = &nft_hash_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), .eval = nft_symhash_eval, .init = nft_symhash_init, .dump = nft_symhash_dump, .reduce = nft_symhash_reduce, }; static const struct nft_expr_ops * nft_hash_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { u32 type; if (!tb[NFTA_HASH_TYPE]) return &nft_jhash_ops; type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); switch (type) { case NFT_HASH_SYM: return &nft_symhash_ops; case NFT_HASH_JENKINS: return &nft_jhash_ops; default: break; } return ERR_PTR(-EOPNOTSUPP); } static struct nft_expr_type nft_hash_type __read_mostly = { .name = "hash", .select_ops = nft_hash_select_ops, .policy = nft_hash_policy, .maxattr = NFTA_HASH_MAX, .owner = THIS_MODULE, }; static int __init nft_hash_module_init(void) { return nft_register_expr(&nft_hash_type); } static void __exit nft_hash_module_exit(void) { nft_unregister_expr(&nft_hash_type); } module_init(nft_hash_module_init); module_exit(nft_hash_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>"); MODULE_ALIAS_NFT_EXPR("hash"); MODULE_DESCRIPTION("Netfilter nftables hash module"); |
| 1 2 1 1 1 16 4 1 13 5 13 4 22 23 1 22 22 10 13 4 2 1 2 3 11 1 2 8 1 2 1 1 1 1 1 1 1 1 1 1 4 1 1 2 5 1 3 1 713 13 713 713 713 712 79 78 10 77 7 71 6 66 3 3 41 1 1 6 6 6 1 2 4 4 2 3 1 1 2 2 1 2 3 3 9 8 4 7 2 2 3 3 3 4 4 4 14 1 6 5 7 11 6 5 11 11 13 10 10 3 1 7 5 2 3 10 10 3 1 7 6 1 5 8 2 1 3 1 3 7 1 6 3 1 2 4 1 3 1 7 7 2 4 4 2 1 1 2 45 45 42 3 3 3 3 851 714 16 801 7 44 41 43 798 796 190 3 1 3 361 698 699 3 698 741 106 13 100 100 14 1 5 5 1 45 45 15 1 14 3 3 3 15 66 3 63 48 48 2 48 38 35 66 9 4 3 2 2 1 5 4 1 3 4 1 656 131 608 129 608 2 658 657 4 623 623 428 648 653 23 1 17 5 8 9 22 41 41 29 2 27 9 18 120 1 1 1 34 109 8 41 5 5 5 10 3 1 7 7 8 4 2 2 2 2 1 1 1 4 5 1 4 4 4 1 1 1 3 1 7 1 6 6 1 6 6 1 27 27 2 2 12 17 4 14 648 149 42 42 87 55 32 2 32 32 11 11 7 10 10 16 1 6 3 7 2 7 8 10 7 4 3 1 2 2 3 131 149 148 149 8 139 2 3 136 139 149 13 3 2 3 14 6 9 10 26 26 15 10 10 8 1 1 11 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc base directory handling functions * * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. * Instead of using magical inumbers to determine the kind of object * we allocate and fill in-core inodes upon lookup. They don't even * go into icache. We cache the reference to task_struct upon lookup too. * Eventually it should become a filesystem in its own. We don't use the * rest of procfs anymore. * * * Changelog: * 17-Jan-2005 * Allan Bezerra * Bruna Moreira <bruna.moreira@indt.org.br> * Edjard Mota <edjard.mota@indt.org.br> * Ilias Biris <ilias.biris@indt.org.br> * Mauricio Lin <mauricio.lin@indt.org.br> * * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * * A new process specific entry (smaps) included in /proc. It shows the * size of rss for each memory area. The maps entry lacks information * about physical memory size (rss) for each mapped file, i.e., * rss information for executables and library files. * This additional information is useful for any tools that need to know * about physical memory consumption for a process specific library. * * Changelog: * 21-Feb-2005 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * Pud inclusion in the page table walking. * * ChangeLog: * 10-Mar-2005 * 10LE Instituto Nokia de Tecnologia - INdT: * A better way to walks through the page table as suggested by Hugh Dickins. * * Simo Piiroinen <simo.piiroinen@nokia.com>: * Smaps information related to shared, private, clean and dirty pages. * * Paul Mundt <paul.mundt@nokia.com>: * Overall revision about smaps. */ #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/task_io_accounting_ops.h> #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> #include <linux/nsproxy.h> #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> #include <linux/ksm.h> #include <uapi/linux/lsm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" #include "../../lib/kstrtox.h" /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during * each system call not at open time. The reason is that most of * what we wish to check for permissions in /proc varies at runtime. * * The classic example of a problem is opening file descriptors * in /proc for a task before it execs a suid executable. */ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; enum proc_mem_force { PROC_MEM_FORCE_ALWAYS, PROC_MEM_FORCE_PTRACE, PROC_MEM_FORCE_NEVER }; static enum proc_mem_force proc_mem_force_override __ro_after_init = IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : PROC_MEM_FORCE_ALWAYS; static const struct constant_table proc_mem_force_table[] __initconst = { { "always", PROC_MEM_FORCE_ALWAYS }, { "ptrace", PROC_MEM_FORCE_PTRACE }, { "never", PROC_MEM_FORCE_NEVER }, { } }; static int __init early_proc_mem_force_override(char *buf) { if (!buf) return -EINVAL; /* * lookup_constant() defaults to proc_mem_force_override to preseve * the initial Kconfig choice in case an invalid param gets passed. */ proc_mem_force_override = lookup_constant(proc_mem_force_table, buf, proc_mem_force_override); return 0; } early_param("proc_mem.force_override", early_proc_mem_force_override); struct pid_entry { const char *name; unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; }; #define NOD(NAME, MODE, IOP, FOP, OP) { \ .name = (NAME), \ .len = sizeof(NAME) - 1, \ .mode = MODE, \ .iop = IOP, \ .fop = FOP, \ .op = OP, \ } #define DIR(NAME, MODE, iops, fops) \ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) #define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ { .proc_get_link = get_link } ) #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) #define ATTR(LSMID, NAME, MODE) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_pid_attr_operations, \ { .lsmid = LSMID }) /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; } return count; } static int get_task_root(struct task_struct *task, struct path *root) { int result = -ENOENT; task_lock(task); if (task->fs) { get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } static int proc_cwd_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { task_lock(task); if (task->fs) { get_fs_pwd(task->fs, path); result = 0; } task_unlock(task); put_task_struct(task); } return result; } static int proc_root_link(struct dentry *dentry, struct path *path) { struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { result = get_task_root(task, path); put_task_struct(task); } return result; } /* * If the user used setproctitle(), we just get the string from * user space at arg_start, and limit it to a maximum of one page. */ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, size_t count, unsigned long pos, unsigned long arg_start) { char *page; int ret, got; if (pos >= PAGE_SIZE) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); if (got > 0) { int len = strnlen(page, got); /* Include the NUL character if it was found */ if (len < got) len++; if (len > pos) { len -= pos; if (len > count) len = count; len -= copy_to_user(buf, page+pos, len); if (!len) len = -EFAULT; ret = len; } } free_page((unsigned long)page); return ret; } static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, size_t count, loff_t *ppos) { unsigned long arg_start, arg_end, env_start, env_end; unsigned long pos, len; char *page, c; /* Check if process spawned far enough to have cmdline. */ if (!mm->env_end) return 0; spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); if (arg_start >= arg_end) return 0; /* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area. */ if (env_start != arg_end || env_end < env_start) env_start = env_end = arg_end; len = env_end - arg_start; /* We're not going to care if "*ppos" has high bits set */ pos = *ppos; if (pos >= len) return 0; if (count > len - pos) count = len - pos; if (!count) return 0; /* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'. */ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start); /* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range. */ pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos) count = arg_end - pos; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; len = 0; while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break; got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len) len = -EFAULT; break; } pos += got; buf += got; len += got; count -= got; } free_page((unsigned long)page); return len; } static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, size_t count, loff_t *pos) { struct mm_struct *mm; ssize_t ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); return ret; } static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct task_struct *tsk; ssize_t ret; BUG_ON(*pos < 0); tsk = get_proc_task(file_inode(file)); if (!tsk) return -ESRCH; ret = get_task_cmdline(tsk, buf, count, pos); put_task_struct(tsk); if (ret > 0) *pos += ret; return ret; } static const struct file_operations proc_pid_cmdline_ops = { .read = proc_pid_cmdline_read, .llseek = generic_file_llseek, }; #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. * Returns the resolved symbol to user space. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto print0; wchan = get_wchan(task); if (wchan && !lookup_symbol_name(wchan, symname)) { seq_puts(m, symname); return 0; } print0: seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ static int lock_trace(struct task_struct *task) { int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; } static void unlock_trace(struct task_struct *task) { up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long *entries; int err; /* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. * Therefore, this interface is restricted to root. */ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES; entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; err = lock_trace(task); if (!err) { unsigned int i, nr_entries; nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, 0); for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } kfree(entries); return err; } #endif #ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); return 0; } #endif #ifdef CONFIG_LATENCYTOP static int lstats_show_proc(struct seq_file *m, void *v) { int i; struct inode *inode = m->private; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; if (!bt) break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); } } put_task_struct(task); return 0; } static int lstats_open(struct inode *inode, struct file *file) { return single_open(file, lstats_show_proc, inode); } static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; clear_tsk_latency_tracing(task); put_task_struct(task); return count; } static const struct file_operations proc_lstats_operations = { .open = lstats_open, .read = seq_read, .write = lstats_write, .llseek = seq_lseek, .release = single_release, }; #endif static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; long badness; badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it. */ if (badness != LONG_MIN) points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; seq_printf(m, "%lu\n", points); return 0; } struct limit_names { const char *name; const char *unit; }; static const struct limit_names lnames[RLIM_NLIMITS] = { [RLIMIT_CPU] = {"Max cpu time", "seconds"}, [RLIMIT_FSIZE] = {"Max file size", "bytes"}, [RLIMIT_DATA] = {"Max data size", "bytes"}, [RLIMIT_STACK] = {"Max stack size", "bytes"}, [RLIMIT_CORE] = {"Max core file size", "bytes"}, [RLIMIT_RSS] = {"Max resident set", "bytes"}, [RLIMIT_NPROC] = {"Max processes", "processes"}, [RLIMIT_NOFILE] = {"Max open files", "files"}, [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, [RLIMIT_AS] = {"Max address space", "bytes"}, [RLIMIT_LOCKS] = {"Max file locks", "locks"}, [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, [RLIMIT_NICE] = {"Max nice priority", NULL}, [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, }; /* Display limits for a process */ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned int i; unsigned long flags; struct rlimit rlim[RLIM_NLIMITS]; if (!lock_task_sighand(task, &flags)) return 0; memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); /* * print the file header */ seq_puts(m, "Limit " "Soft Limit " "Hard Limit " "Units \n"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); else seq_printf(m, "%-20lu ", rlim[i].rlim_max); if (lnames[i].unit) seq_printf(m, "%-10s\n", lnames[i].unit); else seq_putc(m, '\n'); } return 0; } #ifdef CONFIG_HAVE_ARCH_TRACEHOOK static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct syscall_info info; u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); else if (info.data.nr < 0) seq_printf(m, "%d 0x%llx 0x%llx\n", info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ /************************************************************************/ /* Here the fs part begins */ /************************************************************************/ /* permission checks */ static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. */ task = get_proc_task(inode); if (task) { allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; } int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } /* * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)? */ static bool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, enum proc_hidepid hide_pid_min) { /* * If 'hidpid' mount option is set force a ptrace check, * we indicate that we are using a filesystem syscall * by passing PTRACE_MODE_READ_FSCREDS */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); if (fs_info->hide_pid < hide_pid_min) return true; if (in_group_p(fs_info->pid_gid)) return true; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } static int proc_pid_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; bool has_perms; task = get_proc_task(inode); if (!task) return -ESRCH; has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); put_task_struct(task); if (!has_perms) { if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process * may not stat() a file, it shouldn't be seen * in procfs at all. */ return -ENOENT; } return -EPERM; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); put_task_struct(task); return ret; } static int proc_single_open(struct inode *inode, struct file *filp) { return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { .open = proc_single_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; /* * proc_mem_open() can return errno, NULL or mm_struct*. * * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) * - Returns mm_struct* on success * - Returns error code on failure */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); struct mm_struct *mm; if (!task) return ERR_PTR(-ESRCH); mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (IS_ERR(mm)) return mm == ERR_PTR(-ESRCH) ? NULL : mm; /* ensure this mm_struct can't be freed */ mmgrab(mm); /* but do not pin its memory */ mmput(mm); return mm; } static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR_OR_NULL(mm)) return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } static int mem_open(struct inode *inode, struct file *file) { if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) return -EINVAL; return __mem_open(inode, file, PTRACE_MODE_ATTACH); } static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) { struct task_struct *task; bool ptrace_active = false; switch (proc_mem_force_override) { case PROC_MEM_FORCE_NEVER: return false; case PROC_MEM_FORCE_PTRACE: task = get_proc_task(file_inode(file)); if (task) { ptrace_active = READ_ONCE(task->ptrace) && READ_ONCE(task->mm) == mm && READ_ONCE(task->parent) == current; put_task_struct(task); } return ptrace_active; default: return true; } } static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { struct mm_struct *mm = file->private_data; unsigned long addr = *ppos; ssize_t copied; char *page; unsigned int flags; if (!mm) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; copied = 0; if (!mmget_not_zero(mm)) goto free; flags = write ? FOLL_WRITE : 0; if (proc_mem_foll_force(file, mm)) flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; break; } if (!write && copy_to_user(buf, page, this_len)) { copied = -EFAULT; break; } buf += this_len; addr += this_len; copied += this_len; count -= this_len; } *ppos = addr; mmput(mm); free: free_page((unsigned long) page); return copied; } static ssize_t mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, buf, count, ppos, 0); } static ssize_t mem_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return mem_rw(file, (char __user*)buf, count, ppos, 1); } loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: file->f_pos = offset; break; case 1: file->f_pos += offset; break; default: return -EINVAL; } force_successful_syscall_return(); return file->f_pos; } static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, .release = mem_release, .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ); } static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { char *page; unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; unsigned long env_start, env_end; /* Ensure the process spawned far enough to have an environment. */ if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; ret = 0; if (!mmget_not_zero(mm)) goto free; spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; int retval; if (src >= (env_end - env_start)) break; this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; break; } if (copy_to_user(buf, page, retval)) { ret = -EFAULT; break; } ret += retval; src += retval; buf += retval; count -= retval; } *ppos = src; mmput(mm); free: free_page((unsigned long) page); return ret; } static const struct file_operations proc_environ_operations = { .open = environ_open, .read = environ_read, .llseek = generic_file_llseek, .release = mem_release, }; static int auxv_open(struct inode *inode, struct file *file) { return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); } static ssize_t auxv_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; if (!mm) return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); } static const struct file_operations proc_auxv_operations = { .open = auxv_open, .read = auxv_read, .llseek = generic_file_llseek, .release = mem_release, }; static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; if (!task) return -ESRCH; if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) oom_adj = OOM_ADJUST_MAX; else oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); if (oom_adj > OOM_ADJUST_MAX) oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mutex_lock(&oom_adj_mutex); if (legacy) { if (oom_adj < task->signal->oom_score_adj && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } /* * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead. */ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); } else { if ((short)oom_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_unlock; } } /* * Make sure we will check other processes sharing the mm if this is * not vfrok which wants its own oom_score_adj. * pin the mm so it doesn't go away and get reused after task_unlock */ if (!task->vfork_done) { struct task_struct *p = find_lock_task_mm(task); if (p) { if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) { mm = p->mm; mmgrab(mm); } task_unlock(p); } } task->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = (short)oom_adj; trace_oom_score_adj_update(task); if (mm) { struct task_struct *p; rcu_read_lock(); for_each_process(p) { if (same_thread_group(task, p)) continue; /* do not touch kernel threads or the global init */ if (p->flags & PF_KTHREAD || is_global_init(p)) continue; task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; } task_unlock(p); } rcu_read_unlock(); mmdrop(mm); } err_unlock: mutex_unlock(&oom_adj_mutex); put_task_struct(task); return err; } /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. * Values written to oom_adj are simply mapped linearly to oom_score_adj. * Processes that become oom disabled via oom_adj will still be oom disabled * with this implementation. * * oom_adj cannot be removed since existing userspace binaries use it. */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF] = {}; int oom_adj; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_adj); if (err) goto out; if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && oom_adj != OOM_DISABLE) { err = -EINVAL; goto out; } /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. */ if (oom_adj == OOM_ADJUST_MAX) oom_adj = OOM_SCORE_ADJ_MAX; else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_adj_operations = { .read = oom_adj_read, .write = oom_adj_write, .llseek = generic_file_llseek, }; static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; size_t len; if (!task) return -ESRCH; oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char buffer[PROC_NUMBUF] = {}; int oom_score_adj; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { err = -EFAULT; goto out; } err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || oom_score_adj > OOM_SCORE_ADJ_MAX) { err = -EINVAL; goto out; } err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? err : count; } static const struct file_operations proc_oom_score_adj_operations = { .read = oom_score_adj_read, .write = oom_score_adj_write, .llseek = default_llseek, }; #ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", from_kuid(file->f_cred->user_ns, audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); uid_t loginuid; kuid_t kloginuid; int rv; /* Don't let kthreads write their own loginuid */ if (current->flags & PF_KTHREAD) return -EPERM; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); if (*ppos != 0) { /* No partial writes. */ return -EINVAL; } rv = kstrtou32_from_user(buf, count, 10, &loginuid); if (rv < 0) return rv; /* is userspace tring to explicitly UNSET the loginuid? */ if (loginuid == AUDIT_UID_UNSET) { kloginuid = INVALID_UID; } else { kloginuid = make_kuid(file->f_cred->user_ns, loginuid); if (!uid_valid(kloginuid)) return -EINVAL; } rv = audit_set_loginuid(kloginuid); if (rv < 0) return rv; return count; } static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_sessionid(task)); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_FAULT_INJECTION static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; if (!task) return -ESRCH; make_it_fail = task->make_it_fail; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); return simple_read_from_buffer(buf, count, ppos, buffer, len); } static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); if (rv < 0) return rv; if (make_it_fail < 0 || make_it_fail > 1) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; put_task_struct(task); return count; } static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; int err; unsigned int n; err = kstrtouint_from_user(buf, count, 0, &n); if (err) return err; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->fail_nth = n; put_task_struct(task); return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char numbuf[PROC_NUMBUF]; ssize_t len; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { .read = proc_fail_nth_read, .write = proc_fail_nth_write, }; #endif /* * Print out various scheduling related per-task fields: */ static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct pid_namespace *ns = proc_pid_ns(inode->i_sb); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_show_task(p, ns, m); put_task_struct(p); return 0; } static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_set_task(p); put_task_struct(p); return count; } static int sched_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_show, inode); } static const struct file_operations proc_pid_sched_operations = { .open = sched_open, .read = seq_read, .write = sched_write, .llseek = seq_lseek, .release = single_release, }; #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: */ static int sched_autogroup_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_sched_autogroup_show_task(p, m); put_task_struct(p); return 0; } static ssize_t sched_autogroup_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[PROC_NUMBUF] = {}; int nice; int err; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; err = kstrtoint(strstrip(buffer), 0, &nice); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; put_task_struct(p); return count; } static int sched_autogroup_open(struct inode *inode, struct file *filp) { int ret; ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; m->private = inode; } return ret; } static const struct file_operations proc_pid_sched_autogroup_operations = { .open = sched_autogroup_open, .read = seq_read, .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_SCHED_AUTOGROUP */ #ifdef CONFIG_TIME_NS static int timens_offsets_show(struct seq_file *m, void *v) { struct task_struct *p; p = get_proc_task(file_inode(m->file)); if (!p) return -ESRCH; proc_timens_show_offsets(p, m); put_task_struct(p); return 0; } static ssize_t timens_offsets_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file_inode(file); struct proc_timens_offset offsets[2]; char *kbuf = NULL, *pos, *next_line; struct task_struct *p; int ret, noffsets; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* Parse the user data */ ret = -EINVAL; noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; char clock[10]; int err; /* Find the end of line and ensure we don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } err = sscanf(pos, "%9s %lld %lu", clock, &off->val.tv_sec, &off->val.tv_nsec); if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) goto out; clock[sizeof(clock) - 1] = 0; if (strcmp(clock, "monotonic") == 0 || strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) off->clockid = CLOCK_MONOTONIC; else if (strcmp(clock, "boottime") == 0 || strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) off->clockid = CLOCK_BOOTTIME; else goto out; noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line) count = next_line - kbuf; break; } } ret = -ESRCH; p = get_proc_task(inode); if (!p) goto out; ret = proc_timens_set_offset(file, p, offsets, noffsets); put_task_struct(p); if (ret) goto out; ret = count; out: kfree(kbuf); return ret; } static int timens_offsets_open(struct inode *inode, struct file *filp) { return single_open(filp, timens_offsets_show, inode); } static const struct file_operations proc_timens_offsets_operations = { .open = timens_offsets_open, .read = seq_read, .write = timens_offsets_write, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_TIME_NS */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN] = {}; const size_t maxlen = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); if (!p) return -ESRCH; if (same_thread_group(current, p)) { set_task_comm(p, buffer); proc_comm_connector(p); } else count = -EINVAL; put_task_struct(p); return count; } static int comm_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; proc_task_name(m, p, false); seq_putc(m, '\n'); put_task_struct(p); return 0; } static int comm_open(struct inode *inode, struct file *filp) { return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { .open = comm_open, .read = seq_read, .write = comm_write, .llseek = seq_lseek, .release = single_release, }; static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; exe_file = get_task_exe_file(task); put_task_struct(task); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); fput(exe_file); return 0; } else return -ENOENT; } static const char *proc_pid_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct path path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = nd_jump_link(&path); out: return ERR_PTR(error); } static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) { char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: kfree(tmp); return len; } static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; error = do_proc_readlink(&path, buffer, buflen); path_put(&path); out: return error; } const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_pid_get_link, .setattr = proc_setattr, }; /* building an inode */ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a * proc file for a task. */ const struct cred *cred; kuid_t uid; kgid_t gid; if (unlikely(task->flags & PF_KTHREAD)) { *ruid = GLOBAL_ROOT_UID; *rgid = GLOBAL_ROOT_GID; return; } /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); uid = cred->euid; gid = cred->egid; rcu_read_unlock(); /* * Before the /proc/pid/status file was created the only way to read * the effective uid of a /process was to stat /proc/pid. Reading * /proc/pid/status is slow enough that procps and other packages * kept stating /proc/pid. To keep the rules in /proc simple I have * made this apply to all per process world readable and executable * directories. */ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { struct mm_struct *mm; task_lock(task); mm = task->mm; /* Make non-dumpable tasks owned by some root */ if (mm) { if (get_dumpable(mm) != SUID_DUMP_USER) { struct user_namespace *user_ns = mm->user_ns; uid = make_kuid(user_ns, 0); if (!uid_valid(uid)) uid = GLOBAL_ROOT_UID; gid = make_kgid(user_ns, 0); if (!gid_valid(gid)) gid = GLOBAL_ROOT_GID; } } else { uid = GLOBAL_ROOT_UID; gid = GLOBAL_ROOT_GID; } task_unlock(task); } *ruid = uid; *rgid = gid; } void proc_pid_evict_inode(struct proc_inode *ei) { struct pid *pid = ei->pid; if (S_ISDIR(ei->vfs_inode.i_mode)) { spin_lock(&pid->lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&pid->lock); } } struct inode *proc_pid_make_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; struct pid *pid; /* We need a new inode */ inode = new_inode(sb); if (!inode) goto out; /* Common stuff */ ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. */ pid = get_task_pid(task, PIDTYPE_PID); if (!pid) goto out_unlock; /* Let the pid remember us for quick removal */ ei->pid = pid; task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); out: return inode; out_unlock: iput(inode); return NULL; } /* * Generating an inode and adding it into @pid->inodes, so that task will * invalidate inode's dentry before being released. * * This helper is used for creating dir-type entries under '/proc' and * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' * can be released by invalidating '/proc/<tgid>' dentry. * In theory, dentries under '/proc/<tgid>/task' can also be released by * invalidating '/proc/<tgid>' dentry, we reserve it to handle single * thread exiting situation: Any one of threads should invalidate its * '/proc/<tgid>/task/<pid>' dentry before released. */ static struct inode *proc_pid_make_base_inode(struct super_block *sb, struct task_struct *task, umode_t mode) { struct inode *inode; struct proc_inode *ei; struct pid *pid; inode = proc_pid_make_inode(sb, task, mode); if (!inode) return NULL; /* Let proc_flush_pid find this directory inode */ ei = PROC_I(inode); pid = ei->pid; spin_lock(&pid->lock); hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); spin_unlock(&pid->lock); return inode; } int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, * it only makes getattr() consistent with readdir(). */ return -ENOENT; } task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); } rcu_read_unlock(); return 0; } /* dentry stuff */ /* * Set <pid>/... inode ownership (can change due to setuid(), etc.) */ void pid_update_inode(struct task_struct *task, struct inode *inode) { task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); } /* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. * */ static int pid_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; int ret = 0; rcu_read_lock(); inode = d_inode_rcu(dentry); if (!inode) goto out; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { pid_update_inode(task, inode); ret = 1; } out: rcu_read_unlock(); return ret; } static inline bool proc_inode_is_dead(struct inode *inode) { return !proc_pid(inode)->tasks[PIDTYPE_PID].first; } int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, }; /* Lookups */ /* * Fill a directory entry. * * If possible create the dcache entry and derive our inode number and * file type from dcache entry. * * Since all of the proc inode numbers are dynamically generated, the inode * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; struct qstr qname = QSTR_INIT(name, len); struct inode *inode; unsigned type = DT_UNKNOWN; ino_t ino = 1; child = try_lookup_noperm(&qname, dir); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { struct dentry *res; res = instantiate(child, task, ptr); d_lookup_done(child); if (unlikely(res)) { dput(child); child = res; if (IS_ERR(child)) goto end_instantiate; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); end_instantiate: return dir_emit(ctx, name, len, ino, type); } /* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses. */ static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { const char *str = dentry->d_name.name; unsigned long long sval, eval; unsigned int len; if (str[0] == '0' && str[1] != '-') return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (sval != (unsigned long)sval) return -EINVAL; str += len; if (*str != '-') return -EINVAL; str++; if (str[0] == '0' && str[1]) return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (eval != (unsigned long)eval) return -EINVAL; str += len; if (*str != '\0') return -EINVAL; *start = sval; *end = eval; return 0; } static int map_files_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; struct mm_struct *mm = NULL; struct task_struct *task; struct inode *inode; int status = 0; if (flags & LOOKUP_RCU) return -ECHILD; inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { status = mmap_read_lock_killable(mm); if (!status) { exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); mmap_read_unlock(mm); } } mmput(mm); if (exact_vma_exists) { task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); status = 1; } out: put_task_struct(task); out_notask: return status; } static const struct dentry_operations tid_map_files_dentry_operations = { .d_revalidate = map_files_d_revalidate, .d_delete = pid_delete_dentry, }; static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; int rc; rc = -ENOENT; task = get_proc_task(d_inode(dentry)); if (!task) goto out; mm = get_task_mm(task); put_task_struct(task); if (!mm) goto out; rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); if (rc) goto out_mmput; rc = mmap_read_lock_killable(mm); if (rc) goto out_mmput; rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } mmap_read_unlock(mm); out_mmput: mmput(mm); out: return rc; } struct map_files_info { unsigned long start; unsigned long end; fmode_t mode; }; /* * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due * to concerns about how the symlinks may be used to bypass permissions on * ancestor directories in the path to the file in question. */ static const char * proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM); return proc_pid_get_link(dentry, inode, done); } /* * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, .get_link = proc_map_files_get_link, .setattr = proc_setattr, }; static struct dentry * proc_map_files_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | ((mode & FMODE_READ ) ? S_IRUSR : 0) | ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; return proc_splice_unmountable(inode, dentry, &tid_map_files_dentry_operations); } static struct dentry *proc_map_files_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; struct dentry *result; struct mm_struct *mm; result = ERR_PTR(-ENOENT); task = get_proc_task(dir); if (!task) goto out; result = ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; result = ERR_PTR(-EINTR); if (mmap_read_lock_killable(mm)) goto out_put_mm; result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; if (vma->vm_file) result = proc_map_files_instantiate(dentry, task, (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(task); out: return result; } static const struct inode_operations proc_map_files_inode_operations = { .lookup = proc_map_files_lookup, .permission = proc_fd_permission, .setattr = proc_setattr, }; static int proc_map_files_readdir(struct file *file, struct dir_context *ctx) { struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; struct vma_iterator vmi; genradix_init(&fa); ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) goto out; ret = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task; mm = get_task_mm(task); if (!mm) goto out_put_task; ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); goto out_put_task; } nr_files = 0; /* * We need two passes here: * * 1) Collect vmas of mapped files with mmap_lock taken * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() * routine might require mmap_lock taken in might_fault(). */ pos = 2; vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) continue; p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) { ret = -ENOMEM; mmap_read_unlock(mm); mmput(mm); goto out_put_task; } p->start = vma->vm_start; p->end = vma->vm_end; p->mode = vma->vm_file->f_mode; } mmap_read_unlock(mm); mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) break; ctx->pos++; } out_put_task: put_task_struct(task); out: genradix_free(&fa); return ret; } static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, .iterate_shared = proc_map_files_readdir, .llseek = generic_file_llseek, }; #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { struct pid *pid; struct task_struct *task; struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) { struct timers_private *tp = m->private; tp->task = get_pid_task(tp->pid, PIDTYPE_PID); if (!tp->task) return ERR_PTR(-ESRCH); rcu_read_lock(); return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; if (tp->task) { put_task_struct(tp->task); tp->task = NULL; rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { static const char * const nstr[] = { [SIGEV_SIGNAL] = "signal", [SIGEV_NONE] = "none", [SIGEV_THREAD] = "thread", }; struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); struct timers_private *tp = m->private; int notify = timer->it_sigev_notify; guard(spinlock_irq)(&timer->it_lock); if (!posixtimer_valid(timer)) return 0; seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } static const struct seq_operations proc_timers_seq_ops = { .start = timers_start, .next = timers_next, .stop = timers_stop, .show = show_timer, }; static int proc_timers_open(struct inode *inode, struct file *file) { struct timers_private *tp; tp = __seq_open_private(file, &proc_timers_seq_ops, sizeof(struct timers_private)); if (!tp) return -ENOMEM; tp->pid = proc_pid(inode); tp->ns = proc_pid_ns(inode->i_sb); return 0; } static const struct file_operations proc_timers_operations = { .open = proc_timers_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; #endif static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct inode *inode = file_inode(file); struct task_struct *p; u64 slack_ns; int err; err = kstrtoull_from_user(buf, count, 10, &slack_ns); if (err < 0) return err; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); count = -EPERM; goto out; } rcu_read_unlock(); err = security_task_setscheduler(p); if (err) { count = err; goto out; } } task_lock(p); if (rt_or_dl_task_policy(p)) slack_ns = 0; else if (slack_ns == 0) slack_ns = p->default_timer_slack_ns; p->timer_slack_ns = slack_ns; task_unlock(p); out: put_task_struct(p); return count; } static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; if (p != current) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; goto out; } rcu_read_unlock(); err = security_task_getscheduler(p); if (err) goto out; } task_lock(p); seq_printf(m, "%llu\n", p->timer_slack_ns); task_unlock(p); out: put_task_struct(p); return err; } static int timerslack_ns_open(struct inode *inode, struct file *filp) { return single_open(filp, timerslack_ns_show, inode); } static const struct file_operations proc_pid_set_timerslack_ns_operations = { .open = timerslack_ns_open, .read = seq_read, .write = timerslack_ns_write, .llseek = seq_lseek, .release = single_release, }; static struct dentry *proc_pident_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, const struct pid_entry *p, const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { res = proc_pident_instantiate(dentry, task, p); break; } } put_task_struct(task); out_no_task: return res; } static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { struct task_struct *task = get_proc_task(file_inode(file)); const struct pid_entry *p; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= nents + 2) goto out; for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; ctx->pos++; } out: put_task_struct(task); return 0; } #ifdef CONFIG_SECURITY static int proc_pid_attr_open(struct inode *inode, struct file *file) { file->private_data = NULL; __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); return 0; } static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); if (!task) return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) length = simple_read_from_buffer(buf, count, ppos, p, length); kfree(p); return length; } static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); struct task_struct *task; void *page; int rv; /* A task may only write when it was the opener. */ if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { rcu_read_unlock(); return -ESRCH; } /* A task may only write its own attributes. */ if (current != task) { rcu_read_unlock(); return -EACCES; } /* Prevent changes to overridden credentials. */ if (current_cred() != current_real_cred()) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. */ if (*ppos != 0) return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ rv = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); if (rv < 0) goto out_free; rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); mutex_unlock(¤t->signal->cred_guard_mutex); out_free: kfree(page); out: return rv; } static const struct file_operations proc_pid_attr_operations = { .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ struct dir_context *ctx) \ { \ return proc_pident_readdir(filp, ctx, \ LSM##_attr_dir_stuff, \ ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct file_operations proc_##LSM##_attr_dir_ops = { \ .read = generic_read_dir, \ .iterate_shared = proc_##LSM##_attr_dir_iterate, \ .llseek = default_llseek, \ }; \ \ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ struct dentry *dentry, unsigned int flags) \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ .lookup = proc_##LSM##_attr_dir_lookup, \ .getattr = pid_getattr, \ .setattr = proc_setattr, \ } #ifdef CONFIG_SECURITY_SMACK static const struct pid_entry smack_attr_dir_stuff[] = { ATTR(LSM_ID_SMACK, "current", 0666), }; LSM_DIR_OPS(smack); #endif #ifdef CONFIG_SECURITY_APPARMOR static const struct pid_entry apparmor_attr_dir_stuff[] = { ATTR(LSM_ID_APPARMOR, "current", 0666), ATTR(LSM_ID_APPARMOR, "prev", 0444), ATTR(LSM_ID_APPARMOR, "exec", 0666), }; LSM_DIR_OPS(apparmor); #endif static const struct pid_entry attr_dir_stuff[] = { ATTR(LSM_ID_UNDEF, "current", 0666), ATTR(LSM_ID_UNDEF, "prev", 0444), ATTR(LSM_ID_UNDEF, "exec", 0666), ATTR(LSM_ID_UNDEF, "fscreate", 0666), ATTR(LSM_ID_UNDEF, "keycreate", 0666), ATTR(LSM_ID_UNDEF, "sockcreate", 0666), #ifdef CONFIG_SECURITY_SMACK DIR("smack", 0555, proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), #endif #ifdef CONFIG_SECURITY_APPARMOR DIR("apparmor", 0555, proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), #endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_attr_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { .lookup = proc_attr_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; #endif #ifdef CONFIG_ELF_CORE static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; int ret; if (!task) return -ESRCH; ret = 0; mm = get_task_mm(task); if (mm) { unsigned long flags = __mm_flags_get_dumpable(mm); len = snprintf(buffer, sizeof(buffer), "%08lx\n", ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); } put_task_struct(task); return ret; } static ssize_t proc_coredump_filter_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; struct mm_struct *mm; unsigned int val; int ret; int i; unsigned long mask; ret = kstrtouint_from_user(buf, count, 0, &val); if (ret < 0) return ret; ret = -ESRCH; task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; mm = get_task_mm(task); if (!mm) goto out_no_mm; ret = 0; for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); out_no_mm: put_task_struct(task); out_no_task: if (ret < 0) return ret; return count; } static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, .llseek = generic_file_llseek, }; #endif #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { struct task_io_accounting acct; int result; result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; unsigned int seq = 1; unsigned long flags; rcu_read_lock(); do { seq++; /* 2 on the 1st/lockless path, otherwise odd */ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); acct = sig->ioac; __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); } while (need_seqretry(&sig->stats_lock, seq)); done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } else { acct = task->ioac; } seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", (unsigned long long)acct.rchar, (unsigned long long)acct.wchar, (unsigned long long)acct.syscr, (unsigned long long)acct.syscw, (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); result = 0; out_unlock: up_read(&task->signal->exec_update_lock); return result; } static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 0); } static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { return do_io_accounting(task, m, 1); } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; struct seq_file *seq; int ret = -EINVAL; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; ret = seq_open(file, seq_ops); if (ret) goto err_put_ns; seq = file->private_data; seq->private = ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_id_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; put_user_ns(ns); return seq_release(inode, file); } static int proc_uid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_uid_seq_operations); } static int proc_gid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_gid_seq_operations); } static int proc_projid_map_open(struct inode *inode, struct file *file) { return proc_id_map_open(inode, file, &proc_projid_seq_operations); } static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_gid_map_operations = { .open = proc_gid_map_open, .write = proc_gid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static const struct file_operations proc_projid_map_operations = { .open = proc_projid_map_open, .write = proc_projid_map_write, .read = seq_read, .llseek = seq_lseek, .release = proc_id_map_release, }; static int proc_setgroups_open(struct inode *inode, struct file *file) { struct user_namespace *ns = NULL; struct task_struct *task; int ret; ret = -ESRCH; task = get_proc_task(inode); if (task) { rcu_read_lock(); ns = get_user_ns(task_cred_xxx(task, user_ns)); rcu_read_unlock(); put_task_struct(task); } if (!ns) goto err; if (file->f_mode & FMODE_WRITE) { ret = -EACCES; if (!ns_capable(ns, CAP_SYS_ADMIN)) goto err_put_ns; } ret = single_open(file, &proc_setgroups_show, ns); if (ret) goto err_put_ns; return 0; err_put_ns: put_user_ns(ns); err: return ret; } static int proc_setgroups_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; int ret = single_release(inode, file); put_user_ns(ns); return ret; } static const struct file_operations proc_setgroups_operations = { .open = proc_setgroups_open, .write = proc_setgroups_write, .read = seq_read, .llseek = seq_lseek, .release = proc_setgroups_release, }; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { int err = lock_trace(task); if (!err) { seq_printf(m, "%08x\n", task->personality); unlock_trace(task); } return err; } #ifdef CONFIG_LIVEPATCH static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { seq_printf(m, "%d\n", task->patch_state); return 0; } #endif /* CONFIG_LIVEPATCH */ #ifdef CONFIG_KSM static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; mm = get_task_mm(task); if (mm) { seq_printf(m, "%lu\n", mm->ksm_merging_pages); mmput(mm); } return 0; } static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { struct mm_struct *mm; int ret = 0; mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); seq_printf(m, "ksm_merge_any: %s\n", mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); return ret; } seq_printf(m, "ksm_mergeable: %s\n", ksm_process_mergeable(mm) ? "yes" : "no"); mmap_read_unlock(mm); mmput(mm); } return 0; } #endif /* CONFIG_KSM */ #ifdef CONFIG_KSTACK_ERASE_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long prev_depth = THREAD_SIZE - (task->prev_lowest_stack & (THREAD_SIZE - 1)); unsigned long depth = THREAD_SIZE - (task->lowest_stack & (THREAD_SIZE - 1)); seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", prev_depth, depth); return 0; } #endif /* CONFIG_KSTACK_ERASE_METRICS */ /* * Thread groups */ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif #ifdef CONFIG_TIME_NS REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_ELF_CORE REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_KSTACK_ERASE_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tgid_base_readdir, .llseek = generic_file_llseek, }; struct pid *tgid_pidfd_to_pid(const struct file *file) { if (file->f_op != &proc_tgid_base_operations) return ERR_PTR(-EBADF); return proc_pid(file_inode(file)); } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tgid_base_stuff, tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, .permission = proc_pid_permission, }; /** * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. * @pid: pid that should be flushed. * * This function walks a list of inodes (that belong to any proc * filesystem) that are attached to the pid and flushes them from * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be * flushed instead. This routine is provided to flush those useless * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee * that no dcache entries will exist after a process is reaped * it just makes it very unlikely that any will persist. */ void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tgid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; /* Limit procfs to only ptraceable tasks */ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) goto out_put_task; } result = proc_pid_instantiate(dentry, task, NULL); out_put_task: put_task_struct(task); out: return result; } /* * Find the first task with tgid >= tgid * */ struct tgid_iter { unsigned int tgid; struct task_struct *task; }; static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) { struct pid *pid; if (iter.task) put_task_struct(iter.task); rcu_read_lock(); retry: iter.task = NULL; pid = find_ge_pid(iter.tgid, ns); if (pid) { iter.tgid = pid_nr_ns(pid, ns); iter.task = pid_task(pid, PIDTYPE_TGID); if (!iter.task) { iter.tgid += 1; goto retry; } get_task_struct(iter.task); } rcu_read_unlock(); return iter; } #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) return 0; if (pos == TGID_OFFSET - 2) { struct inode *inode = d_inode(fs_info->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { struct inode *inode = d_inode(fs_info->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } iter.tgid = pos - TGID_OFFSET; iter.task = NULL; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; unsigned int len; cond_resched(); if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); return 0; } } ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } /* * proc_tid_comm_permission is a special permission function exclusively * used for the node /proc/<pid>/task/<tid>/comm. * It bypasses generic permission checks in the case where a task of the same * task group attempts to access the node. * The rationale behind this is that glibc and bionic access this node for * cross thread naming (pthread_set/getname_np(!self)). However, if * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, * which locks out the cross thread naming implementation. * This function makes sure that the node is always accessible for members of * same thread group. */ static int proc_tid_comm_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; task = get_proc_task(inode); if (!task) return -ESRCH; is_same_tgroup = same_thread_group(current, task); put_task_struct(task); if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { /* This file (/proc/<pid>/task/<tid>/comm) can always be * read or written by the members of the corresponding * thread group. */ return 0; } return generic_permission(&nop_mnt_idmap, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { .setattr = proc_setattr, .permission = proc_tid_comm_permission, }; /* * Tasks */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), LNK("root", proc_root_link), LNK("exe", proc_exe_link), REG("mounts", S_IRUGO, proc_mounts_operations), REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS ONE("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif #ifdef CONFIG_PROC_CPU_RESCTRL ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), REG("fail-nth", 0644, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif #ifdef CONFIG_SECCOMP_CACHE_DEBUG ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { return proc_pident_readdir(file, ctx, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, tid_base_stuff, tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, .iterate_shared = proc_tid_base_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; static struct dentry *proc_task_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; inode = proc_pid_make_base_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) return ERR_PTR(-ENOENT); inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags |= S_IMMUTABLE; set_nlink(inode, nlink_tid); pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; struct proc_fs_info *fs_info; struct pid_namespace *ns; struct dentry *result = ERR_PTR(-ENOENT); if (!leader) goto out_no_task; tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; fs_info = proc_sb_info(dentry->d_sb); ns = fs_info->pid_ns; rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; if (!same_thread_group(leader, task)) goto out_drop_task; result = proc_task_instantiate(dentry, task, NULL); out_drop_task: put_task_struct(task); out: put_task_struct(leader); out_no_task: return result; } /* * Find the first tid of a thread group to return to user space. * * Usually this is just the thread group leader, but if the users * buffer was too small or there was a seek into the middle of the * directory we have more work todo. * * In the case of a short read we start with find_task_by_pid. * * In the case of a seek we start with the leader and walk nr * threads past it. */ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, struct pid_namespace *ns) { struct task_struct *pos, *task; unsigned long nr = f_pos; if (nr != f_pos) /* 32bit overflow? */ return NULL; rcu_read_lock(); task = pid_task(pid, PIDTYPE_PID); if (!task) goto fail; /* Attempt to start with the tid of a thread */ if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ if (nr >= get_nr_threads(task)) goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ for_each_thread(task, pos) { if (!nr--) goto found; } fail: pos = NULL; goto out; found: get_task_struct(pos); out: rcu_read_unlock(); return pos; } /* * Find the next thread in the thread list. * Return NULL if there is an error or no next thread. * * The reference to the input task_struct is released. */ static struct task_struct *next_tid(struct task_struct *start) { struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { pos = __next_thread(start); if (pos) get_task_struct(pos); } rcu_read_unlock(); put_task_struct(start); return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct task_struct *task; struct pid_namespace *ns; int tid; if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) return 0; /* We cache the tgid value that the last readdir call couldn't * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); tid = (int)(intptr_t)file->private_data; file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } } return 0; } static int proc_task_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); put_task_struct(p); } return 0; } /* * proc_task_readdir() set @file->private_data to a positive integer * value, so casting that to u64 is safe. generic_llseek_cookie() will * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is * here to catch any unexpected change in behavior either in * proc_task_readdir() or generic_llseek_cookie(). */ static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) { u64 cookie = (u64)(intptr_t)file->private_data; loff_t off; off = generic_llseek_cookie(file, offset, whence, &cookie); WARN_ON_ONCE(cookie > INT_MAX); file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ return off; } static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) { nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } |
| 8785 7244 3611 8550 8540 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000 |